From 397eac213fafd3aa507d422b37ea9627869f1589 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Wed, 16 Dec 2015 16:02:18 -0800 Subject: [PATCH 01/19] first experimental prototype of elementwise tensor op in PlusNode::ForwardProp() done except for actual kernel; new methods TensorShape::Pad() and Concat(); new method ComputationNode::GetTensorsForwardBinary(); moved ElementWiseOperator to CommonMatrix.h, using it in TensorView::DoSumOf(); TensorView::m_sob changed from ref to pointer to make the object copyable --- Source/CNTK/CNTK.cpp | 36 +++++------ Source/Common/Include/DataTensor.h | 46 +++++++++++--- .../ComputationNetworkLib/ComputationNode.cpp | 60 +++++++++++++++++++ .../ComputationNetworkLib/ComputationNode.h | 3 +- .../LinearAlgebraNodes.h | 6 ++ Source/Math/CommonMatrix.h | 25 ++++++++ Source/Math/GPUMatrix.h | 10 ---- Source/Math/TensorView.cpp | 14 +++-- Source/Math/TensorView.h | 13 ++-- 9 files changed, 162 insertions(+), 51 deletions(-) diff --git a/Source/CNTK/CNTK.cpp b/Source/CNTK/CNTK.cpp index 9e4bd127a..f2bd706bc 100644 --- a/Source/CNTK/CNTK.cpp +++ b/Source/CNTK/CNTK.cpp @@ -11,25 +11,8 @@ #define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _ #include "stdafx.h" -#include "Actions.h" -#include -#include -#include -#if defined(_WIN32) -#include "io.h" -#endif -#include "buildinfo.h" -#include "hostname.h" -#ifdef LEAKDETECT -#include "vld.h" // for memory leak detection -#endif -#include -#include -#include -#include -#include - #include "Basics.h" +#include "Actions.h" #include "ComputationNetwork.h" #include "ComputationNode.h" #include "DataReader.h" @@ -53,6 +36,23 @@ #include "BrainScriptEvaluator.h" #include "BrainScriptParser.h" +#include +#include +#include +#if defined(_WIN32) +#include "io.h" +#endif +#include "buildinfo.h" +#include "hostname.h" +#ifdef LEAKDETECT +#include "vld.h" // for memory leak detection +#endif +#include +#include +#include +#include +#include + #ifndef let #define let const auto #endif diff --git a/Source/Common/Include/DataTensor.h b/Source/Common/Include/DataTensor.h index 5423a43a0..5bfe4f410 100644 --- a/Source/Common/Include/DataTensor.h +++ b/Source/Common/Include/DataTensor.h @@ -107,24 +107,33 @@ namespace Microsoft { namespace MSR { namespace CNTK { void Invalidate() { m_dims.assign(3, SIZE_MAX); } // TODO: clean up the valid/invalid situation (this is currently done inconsistently). Also this object is immutable. - void Save(File& fstream) const + // verify that this refers to a dense matrix (no strides) + void VerifyIsDense() const { if (m_offset != 0) - LogicError("TensorShape::Save(): Cannot serialize TensorShape for slices."); + LogicError("TensorShape: A dense TensorShape expected. Offset %d not allowed.", (int)m_offset); + ptrdiff_t mul = 1; + for (size_t k = 0; k < m_dims.size(); k++) // (TODO: we can save one multiplication here) + { + if (m_steps[k] != mul) + LogicError("TensorShape: A dense TensorShape expected. Dimension %d is not.", (int)k); + mul *= (ptrdiff_t)m_dims[k]; + } + } + + void Save(File& fstream) const + { + VerifyIsDense(); // saving as 32-bit ints. 
This allows to continue to support the old format (size_t W, H, C) fstream << (uint32_t)m_dims.size(); - ptrdiff_t mul = 1; - for (size_t k = 0; k < m_dims.size(); k++) + for (auto dim : m_dims) { - auto dim = m_dims[k]; if (dim > UINT32_MAX) LogicError("TensorShape::Save(): Tensor dimensions %s out of bounds (> 4G).", string(*this).c_str()); fstream << (uint32_t)dim; - if (m_steps[k] != mul) - LogicError("TensorShape::Save(): Cannot serialize TensorShape for slices."); - mul *= (ptrdiff_t)dim; } } + void Load(File& fstream) { // format: uint32_t n, dim[0], dim[1], ..., dim[n-1] @@ -182,6 +191,27 @@ namespace Microsoft { namespace MSR { namespace CNTK { return m_steps[k] == m_steps[k - 1] * (ptrdiff_t)m_dims[k - 1]; } + // editing functions + // These all create new TensorShape objects. + TensorShape Pad(size_t numDims) const // append singleton dimensions + { + VerifyIsDense(); + if (numDims < GetNumDims()) + LogicError("PadDims: Cannot drop a shorten the dimensions."); + else if (numDims == GetNumDims()) + return *this; + auto dims = GetDims(); + dims.resize(numDims, 1); + return TensorShape(dims); + } + TensorShape Concat(const TensorShape & other) const // concatenate + { + auto dims = GetDims(); + auto otherDims = other.GetDims(); + dims.insert(dims.end(), otherDims.begin(), otherDims.end()); + return TensorShape(dims); + } + // pretty-printing. Returns tensor dims in the form "I x J x K". operator std::string() const { diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp index 5302e60c4..a104632f7 100644 --- a/Source/ComputationNetworkLib/ComputationNode.cpp +++ b/Source/ComputationNetworkLib/ComputationNode.cpp @@ -13,6 +13,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { + using namespace std; + // ----------------------------------------------------------------------- // subroutines for Validate() implementations // ----------------------------------------------------------------------- @@ -138,6 +140,61 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } + // ----------------------------------------------------------------------- + // tensor helpers + // ----------------------------------------------------------------------- + + template + static TensorShape GetSampleShape(const ComputationNode * node) + { + // TODO: use actual ImageLayout. While those are not yet inferred properly, maybe use it if its dims match numRows? + if (node->HasMBLayout()) // if we have a layout, that dimension is not part of the sample shape + return TensorShape(node->GetNumRows()); + else + return TensorShape(node->GetNumRows(), node->GetNumCols()); + } + + template + std::vector> ComputationNode::GetTensorsForwardBinary(const FrameRange & fr) + { + const size_t N = 3; // 2 inputs and 1 output + // BUGBUG: Currently does not interpret actual ImageLayouts or convolutional models. + // TODO: move this into a helper function + // get tensor shapes + vector*> nodes; + for (size_t i = 0; i < N; i++) + nodes.push_back(i < N-1 ? Input(i).get() : this); + vector> values; + vector shapes; + for (size_t i = 0; i < N; i++) + { + values.push_back(nodes[i]->ValueFor(i < N-1 ? 
fr.AllowBroadcast() : fr)); // no broadcasting for now allowed for output + shapes.push_back(GetSampleShape(nodes[i])); + } + // pad + size_t dims = 0; + for (size_t i = 0; i < N; i++) + if (dims < shapes[i].GetNumDims()) + dims = shapes[i].GetNumDims(); + for (size_t i = 0; i < N; i++) + shapes[i] = shapes[i].Pad(dims); + // concatenate MBLayout dims + // TODO: Is it possible that the output has no layout, but inputs have? Then we lost dimensions. Tensor constructor will catch that, though. + if (HasMBLayout()) + { + for (size_t i = 0; i < N; i++) + { + auto sm = nodes[i]->HasMBLayout() ? TensorShape(GetNumParallelSequences(), GetNumTimeSteps()) : TensorShape(1, 1); + shapes[i] = shapes[i].Concat(sm); + } + } + // perform operation + std::vector> tensors; + for (size_t i = 0; i < N; i++) + tensors.push_back(TensorView(values[i], shapes[i])); + return tensors; + } + // ----------------------------------------------------------------------- // others // ----------------------------------------------------------------------- @@ -172,6 +229,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { template<> std::map> ComputationNode::s_constOnes{}; template<> std::map> ComputationNode::s_constOnes{}; + template class ComputationNode; + template class ComputationNode; + template class LearnableParameter; template class LearnableParameter; }}} diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index f71c26fd3..3dc6a3f20 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -779,7 +779,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { protected: //std containers such as list and map does not support class reference so we need to use pointer typedef shared_ptr> ComputationNodePtr; - ComputationNode() { } public: using ComputationNodeBase::AttachInputs; // import the convenience functions that take 1..6 parameters using ComputationNodeBase::SetDims; @@ -1085,6 +1084,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { const Matrix& Gradient() const { return *m_gradient; } Matrix& Gradient() { return *m_gradient; } + std::vector> GetTensorsForwardBinary(const FrameRange & fr); + // Function to return the number of columns for whole batch or single frame size_t GetNumColsFor(const FrameRange & fr/*select frame or entire batch*/) { diff --git a/Source/ComputationNetworkLib/LinearAlgebraNodes.h b/Source/ComputationNetworkLib/LinearAlgebraNodes.h index b5624b003..79ad40f8e 100644 --- a/Source/ComputationNetworkLib/LinearAlgebraNodes.h +++ b/Source/ComputationNetworkLib/LinearAlgebraNodes.h @@ -20,6 +20,7 @@ #include "Basics.h" #include "Matrix.h" +#include "TensorView.h" #include "ComputationNode.h" #include "ConvolutionalNodes.h" @@ -129,6 +130,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override { +#if 0 // TODO: use #if 0 until this is working + auto args = GetTensorsForwardBinary(fr); + args[2].DoSumOf(0.0f, args[0], args[1], 1.0f); +#else Matrix functionValues = ValueForToDense(fr, false); // Switch to dense as a work-around because ColumnSlice doesn't support all the sparse formats Matrix inputFunctionValues0 = Input(0)->ValueFor(fr.AllowBroadcast()); Matrix inputFunctionValues1 = Input(1)->ValueFor(fr.AllowBroadcast()); @@ -185,6 +190,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } else LogicError("%ls %ls operation's Validate() function let invalid dimensions 
slip by.", NodeName().c_str(), OperationName().c_str()); +#endif #if DUMPOUTPUT functionValues.Print("PlusNode"); #endif diff --git a/Source/Math/CommonMatrix.h b/Source/Math/CommonMatrix.h index ceaa74d66..afd5d7d62 100644 --- a/Source/Math/CommonMatrix.h +++ b/Source/Math/CommonMatrix.h @@ -41,6 +41,28 @@ MATH_API DEVICEID_TYPE EnforceOneGPUOnly(DEVICEID_TYPE requestedDeviceId); namespace Microsoft { namespace MSR { namespace CNTK { + // ----------------------------------------------------------------------- + // ElementWiseOperator -- This enum represents which function to apply. + // This is shared between all matrix types and tensors. + // ----------------------------------------------------------------------- + + enum ElementWiseOperator + { + // binary + opSum, opDifference, opElementWiseProduct, opElementWiseQuotient, + opLogSum, opMax, opMin, + opEQ, opNE, opGT, opLT, opGE, opLE, + // unary (or binary with constant parameter) + opNegate, opNot, + opSaturate, opAbs, + opSigmoid, opSigmoidDerivative, opTanh, opSqrt, opExp, opLog, opLinearRectifierDerivative, opCosine, opNegativeSine + // Note: not all of the above are actually implement at present; and not all that's implemented has an opcode. + }; + + // ----------------------------------------------------------------------- + // various enums to describe + // ----------------------------------------------------------------------- + enum MatrixFlagBitPosition { bitPosRowMajor = 0, // row major matrix @@ -76,6 +98,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { matrixFlagSetValueOnDevice = 1< class BaseMatrix diff --git a/Source/Math/GPUMatrix.h b/Source/Math/GPUMatrix.h index 421959f1a..6b16d3b63 100644 --- a/Source/Math/GPUMatrix.h +++ b/Source/Math/GPUMatrix.h @@ -71,16 +71,6 @@ namespace Microsoft { }; - // ----------------------------------------------------------------------- - // ElementWiseOperator -- This enum represents which function to apply. It needs to be outside of GPUMatrix, because it is also used in GPUSparseMatrix - // ----------------------------------------------------------------------- - - enum ElementWiseOperator - { - opSigmoid = 0, opTanh, opSqrt, opExp, opLog, opAbs, opLinearRectifierDerivative, opCosine, opNegativeSine, opSigmoidDerivative - }; - - // ----------------------------------------------------------------------- // GPUMatrix // ----------------------------------------------------------------------- diff --git a/Source/Math/TensorView.cpp b/Source/Math/TensorView.cpp index bc5c75803..1c843c788 100644 --- a/Source/Math/TensorView.cpp +++ b/Source/Math/TensorView.cpp @@ -26,11 +26,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { // construction // ------------------------------------------------------------------- - // cast a matrix as a tensor + // cast a matrix as a TensorView template TensorView::TensorView(Matrix & sob) : - m_sob(sob), m_shape(TensorShape(array { sob.GetNumRows(), sob.GetNumCols() })) + m_sob(&sob), m_shape(TensorShape(array { sob.GetNumRows(), sob.GetNumCols() })) { } + // reshape a TensorView template TensorView::TensorView(const TensorView & other, const TensorShape & shape) : m_sob(other.m_sob), m_shape(shape) @@ -40,14 +41,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { // TODO: Use the multipliers instead? 
size_t i; size_t rowDim = 1; - for (i = 0; i < m_shape.size() && rowDim < m_sob.GetNumRows(); i++) + for (i = 0; i < m_shape.size() && rowDim < m_sob->GetNumRows(); i++) rowDim *= m_shape[i]; // first i dimensions match matrix row dimension size_t colDim = 1; for (; i < m_shape.size(); i++) colDim *= m_shape[i]; - if (rowDim != m_sob.GetNumRows() || colDim != m_sob.GetNumCols()) - LogicError("TensorView: Tensor dimensions %s do not match storage-object dims %d x %d", string(m_shape).c_str(), (int)m_sob.GetNumRows(), (int)m_sob.GetNumCols()); + if (rowDim != m_sob->GetNumRows() || colDim != m_sob->GetNumCols()) + LogicError("TensorView: Tensor dimensions %s do not match storage-object dims %d x %d", string(m_shape).c_str(), (int)m_sob->GetNumRows(), (int)m_sob->GetNumCols()); } // ------------------------------------------------------------------- @@ -57,7 +58,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { static bool Matches(size_t d1, size_t d2) { return d1 == 1 || d2 == 1 || d1 == d2; } // do two dimensions match? template - void TensorView::DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, int op/*will become an enum later*/) + void TensorView::DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op) { TensorView & c = *this; @@ -110,6 +111,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { as[k] *= as[k - 1]; as[k - 1] = 1; bs[k] *= bs[k - 1]; bs[k - 1] = 1; cs[k] *= cs[k - 1]; cs[k - 1] = 1; + os[k] *= os[k - 1]; os[k - 1] = 1; // BUGBUG: Must update multipliers as well } diff --git a/Source/Math/TensorView.h b/Source/Math/TensorView.h index a737e7746..1a8088a70 100644 --- a/Source/Math/TensorView.h +++ b/Source/Math/TensorView.h @@ -36,17 +36,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { { } // copy constructor TensorView(const TensorView & other) : - TensorView(other.m_sob, other.m_shape) + TensorView(*other.m_sob, other.m_shape) { } - // assignment is forbidden since we contain a reference - // If you ever need this, change the reference to a pointer. - void operator=(const TensorView & other) = delete; // since we have a reference // ------------------------------------------------------------------- // accessors // ------------------------------------------------------------------- - const Matrix & GetSOB() const { return m_sob; } + const Matrix & GetSOB() const { return *m_sob; } const TensorShape & GetShape() const { return m_shape; } // ------------------------------------------------------------------- @@ -59,19 +56,19 @@ namespace Microsoft { namespace MSR { namespace CNTK { // If beta == 0, c is not read out, i.e. it can be uninitialized or contain NaNs. 
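        // Usage sketch (assumed; mirrors Test() in TensorView.cpp -- note that at this point
        // in the patch series the kernel is still a stub that only prints the op):
        // computing c = a + b where the single column of a is broadcast across all columns of b:
        //
        //   Matrix<float> a(-1), b(-1), c(-1);       // -1 = CPU device
        //   a.Resize(13, 1); b.Resize(13, 42); c.Resize(13, 42);
        //   TensorView<float> ta(a), tb(b), tc(c);   // default shapes 13 x 1, 13 x 42, 13 x 42
        //   tc.DoSumOf(0.0f, ta, tb, 1.0f);          // beta = 0: c is overwritten, not accumulated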
// ------------------------------------------------------------------- - void DoSumOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha) { DoBinaryOpOf(beta, a, b, alpha, 0); } + void DoSumOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha) { DoBinaryOpOf(beta, a, b, alpha, ElementWiseOperator::opSum); } static void Test(); private: - void DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, int op/*will become an enum later*/); + void DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op); // ------------------------------------------------------------------- // sob members // ------------------------------------------------------------------- - Matrix & m_sob; // Storage OBject that holds the data that is being viewed with this TensorView + Matrix * m_sob; // Storage OBject that holds the data that is being viewed with this TensorView. Pointer instead of ref so this object is copyable. TensorShape m_shape; // the meta-data that describes the data's shape and/or access pattern // TODO: use a reference here or not? With a reference, we can hide more info in here such as cuDNN handles }; From cd6543e46d20e02cb8954babc2e3e2beca14b086 Mon Sep 17 00:00:00 2001 From: yzhang87 Date: Thu, 17 Dec 2015 13:55:41 -0500 Subject: [PATCH 02/19] A quick fix to the Kaldi Reader (sequence training, need more test!!) --- .../Kaldi2Reader/UtteranceDerivativeBuffer.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Source/Readers/Kaldi2Reader/UtteranceDerivativeBuffer.cpp b/Source/Readers/Kaldi2Reader/UtteranceDerivativeBuffer.cpp index 730a2b8d4..f845a4ca1 100644 --- a/Source/Readers/Kaldi2Reader/UtteranceDerivativeBuffer.cpp +++ b/Source/Readers/Kaldi2Reader/UtteranceDerivativeBuffer.cpp @@ -32,22 +32,26 @@ namespace Microsoft { namespace MSR { namespace CNTK { assert(pMBLayout->GetNumParallelSequences() == m_numUttsPerMinibatch); uttInfoInMinibatch->clear(); uttInfoInMinibatch->resize(uttInfo.size()); + for (size_t i = 0; i < uttInfo.size(); ++i) { size_t startFrameIndexInMinibatch = 0; size_t numFrames = 0; + for (size_t j = 0; j < pMBLayout->GetNumTimeSteps(); ++j) { - if (pMBLayout->Is(i, j, MinibatchPackingFlags::NoLabel)) + /* if (pMBLayout->Is(i, j, MinibatchPackingFlags::NoLabel)) { continue; - } - if (pMBLayout->Is(i, j, MinibatchPackingFlags::NoFeature)) + }*/ + FrameRange fr(pMBLayout,j); + + if (pMBLayout->IsGap(fr.Sequence(i))) { continue; } numFrames += 1; - if (pMBLayout->Is(i, j, MinibatchPackingFlags::SequenceEnd) + if (pMBLayout->IsBeyondStartOrEnd(fr.WithTimeOffset((ptrdiff_t) 1).Sequence(i)) || j == pMBLayout->GetNumTimeSteps() - 1) { size_t uttIndex = (*uttInfoInMinibatch)[i].size(); From bb6fc1bbe10c9a2db84b52d96b86551d7f0440cd Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Thu, 17 Dec 2015 11:33:52 -0800 Subject: [PATCH 03/19] optimized MBLayout::InitAsFrameMode(), short-replacing calls to AddSequence() by a much simpler direct initialization for this special case; added editing functions to TensorShape, and rewrote TensorView::DoBinaryOpOf() to use them --- Source/Common/Include/DataTensor.h | 140 ++++++++++++++---- Source/Common/Include/Sequences.h | 89 +++++++---- .../ComputationNetworkLib/ComputationNode.h | 2 +- Source/Math/Matrix.cpp | 1 + Source/Math/TensorView.cpp | 129 +++++++++------- Source/Math/TensorView.h | 4 +- 6 files changed, 243 insertions(+), 122 deletions(-) diff --git 
a/Source/Common/Include/DataTensor.h b/Source/Common/Include/DataTensor.h index 5bfe4f410..0152343d0 100644 --- a/Source/Common/Include/DataTensor.h +++ b/Source/Common/Include/DataTensor.h @@ -112,12 +112,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { { if (m_offset != 0) LogicError("TensorShape: A dense TensorShape expected. Offset %d not allowed.", (int)m_offset); - ptrdiff_t mul = 1; for (size_t k = 0; k < m_dims.size(); k++) // (TODO: we can save one multiplication here) { - if (m_steps[k] != mul) + ptrdiff_t stride = k > 0 ? m_strides[k - 1] * (ptrdiff_t)m_dims[k - 1] : 1; + if (m_strides[k] != stride) LogicError("TensorShape: A dense TensorShape expected. Dimension %d is not.", (int)k); - mul *= (ptrdiff_t)m_dims[k]; } } @@ -163,8 +162,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // accessors size_t GetDim(size_t k) const { return m_dims[k]; } size_t GetNumDims() const { return m_dims.size(); } - size_t GetNumElements() const { size_t res = 1; for (auto & dim : m_dims) res *= dim; return res; } - ptrdiff_t GetStep(size_t k) const { return m_steps[k]; } + size_t GetNumElements() const { size_t res = 1; for (auto & dim : m_dims) res *= dim; return res; } // in slice size_t GetOffset() const { return m_offset; } // vector-like accessors @@ -172,12 +170,31 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t size() const { return GetNumDims(); } const std::vector & GetDims() const { return m_dims; } // get all, e.g. for logging or for constructing derived tensors with edited dimensions + const std::vector & GetStrides() const { return m_strides; } // interpretation as an image tensor size_t GetNumChannels() const { return m_dims[0]; } size_t GetWidth() const { return m_dims[1]; } size_t GetHeight() const { return m_dims[2]; } + // indexing + // Determines the offset into the underlying element array for a given multi-dimensional index. + // This function is for reference. Probably not often used. + size_t Locate(const std::vector & index) const + { + ptrdiff_t location = m_offset; + for (size_t k = 0; k < index.size(); k++) + { + size_t dim = k < size() ? m_dims[k] : 1; // dimensions are bottomless + if (index[k] >= dim) + LogicError("Locate: Tensor index[%d]=%d exceeds bound %d.", (int)k, (int)index[k], (int)dim); + location += (ptrdiff_t)index[k] * m_strides[k]; // strides may be negative + } + if (location < 0 || (size_t)location >= m_allocation) + LogicError("Locate: Tensor index out of bounds."); + return (size_t)location; + } + // helpers for tensor operations bool CanFlatten(size_t k) const // can dims k and k-1 be flattened into a single vector? (do they form a matrix without stride) { @@ -188,16 +205,71 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (m_dims[k] == 1 || m_dims[k - 1] == 1) // both are broadcasting or scalar--we don't care about stride in this case return true; else - return m_steps[k] == m_steps[k - 1] * (ptrdiff_t)m_dims[k - 1]; + return m_strides[k] == m_strides[k - 1] * (ptrdiff_t)m_dims[k - 1]; } - // editing functions // These all create new TensorShape objects. + TensorShape Flatten(size_t k) const // flatten [k] with [k-1] + { + TensorShape result = *this; + if (!CanFlatten(k)) + LogicError("Flatten() cannot flatten dimensions with gaps"); + // We reshape local (I x J) sub-matrices to (1 x I*J) sub-matrices. + // We merge to right so that we can merge multiple by looping left-to-right. 
+ // m_dims = I J K L + // m_strides = 1 I I*J I*J*K + // flattening J and K + // m_dims = I 1 J*K L + // m_strides = 1 I I I*J*K + // TODO: rethink whether this is correct for example of negative strides + result.m_dims[k] *= result.m_dims[k - 1]; + result.m_dims[k - 1] = 1; + result.m_strides[k] = /*result.m_dims[k - 1] *, it's 1 */ result.m_strides[k - 1]; + return result; + } + TensorShape DropSingletonDims(const std::vector & toDrop) const // flatten [k] with [k-1] if toFlatten[k] is set + { + TensorShape result = *this; + size_t j = 0; + for (size_t k = 0; k < size(); k++) + { + if (toDrop[k]) + { + if (result.m_dims[k] != 1) + LogicError("DeropSingletonDims() cannot drop non-singleton dimensions."); + else + continue; + } + else + { + // example + // m_dims = I 1 J K + // m_strides = 1 I I I*J + // dropping the second dimension + // m_dims = I % J K + // m_strides = 1 % I I*J + result.m_dims[j] = result.m_dims[k]; + result.m_strides[j] = result.m_strides[k]; + j++; + } + } + result.m_dims.resize(j); + result.m_strides.resize(j); + return result; + } + TensorShape WithBroadcastStrides() const // flatten [k] with [k-1] if toFlatten[k] is set + { + TensorShape result = *this; + for (size_t k = 0; k < size(); k++) + if (result.m_dims[k] == 1) + result.m_strides[k] = 0; + return result; + } TensorShape Pad(size_t numDims) const // append singleton dimensions { VerifyIsDense(); if (numDims < GetNumDims()) - LogicError("PadDims: Cannot drop a shorten the dimensions."); + LogicError("Pad() cannot drop a shorten the dimensions."); else if (numDims == GetNumDims()) return *this; auto dims = GetDims(); @@ -216,59 +288,65 @@ namespace Microsoft { namespace MSR { namespace CNTK { operator std::string() const { std::string s; - for (const auto & dim : m_dims) + for (size_t k = 0; k < size(); k++) { if (!s.empty()) s.append(" x "); - s.append(std::to_string(dim)); + s.append(std::to_string(m_dims[k])); } +#ifdef _DEBUG // also emit the strides, easier for debugging + s.append(" {"); + for (size_t k = 0; k < size(); k++) + { + if (k > 0) + s.append(","); + s.append(std::to_string(m_strides[k])); + } + s.append("}"); +#endif return s; } private: - // reset m_steps and m_offset to represent a canonical no-strides tensor + // reset m_strides and m_offset to represent a canonical no-strides tensor void InitAsNoSlice() { m_offset = 0; - m_steps.resize(m_dims.size()); - ptrdiff_t mul = 1; + m_strides.resize(m_dims.size()); for (size_t k = 0; k < m_dims.size(); k++) - { - m_steps[k] = (ptrdiff_t)mul; - mul *= m_dims[k]; - } + m_strides[k] = k > 0 ? m_strides[k - 1] * (ptrdiff_t)m_dims[k - 1] : 1; + m_allocation = m_dims.empty() ? 0 : m_dims.back() * (size_t)m_strides.back(); } private: std::vector m_dims; // dimensions of tensor or tensor slice. The size of the box. - std::vector m_steps; // dimension gets multiplied by this for computing the index offset. How to hop to the next element in dimension[k]. Stride magic happening here! + std::vector m_strides; // dimension gets multiplied by this for computing the index offset. How to hop to the next element in dimension[k]. Stride magic happening here! size_t m_offset; // offset to element(0,0,...,0). May be non-0 in case of slicing. - // For a regular tensor, there are no strides, m_steps[k] = m_steps[k-1] * m_dims[k-1]. This is how TensorShapes are created from dimensions. + size_t m_allocation; // allocation size of original dense tensor + // For a regular tensor, there are no strides, m_strides[k] = m_strides[k-1] * m_dims[k-1]. 
This is how TensorShapes are created from dimensions. // For views into existing tensors, we do stride shenanigans to implement broadcasting (plus magic tricks). Examples: // To traverse a 5 x 10 matrix with column order reversed: // - op.dims = (5 x 10) // - m_offset points to element (0,9) - // - m_steps[0] = 1 // regular forward iteration within each column - // - m_steps[1] = -5 // backward iteration over columns + // - m_strides = (1, -5) // backward iteration over columns // To compute matrix C(13 x 42) = vector A(13 x 1) + matrix B(13 x 42): // - op = sum // - op.dims = (13 x 42) - // - *.m_steps[0] = 1 // forward iteration through each column - // - C.m_steps[1] = 13 // forward iteration over columns of B--defines the for loop - // - B.m_steps[1] = 13 // forward iteration over columns of B--iterates in sync with C - // - A.m_steps[1] = 0 // A, however, is stuck in column 0 forever + // - C.m_strides = (1, 13) // forward iteration over columns of B--defines the for loop + // - B.m_strides = (1, 13) // forward iteration over columns of B--iterates in sync with C + // - A.m_strides = (1, 0) // A, however, is stuck in column 0 forever // Matrix product: C(I x K) = A(I x J) * B(J x K) --Note: Likely not RAM-bandwidth efficient! // - op = mul // - op.dims = (I x J x K) // iteration dimensions - // - C.m_steps = (1, 0, I) // inverse broadcasting for inner dimension - // - A.m_steps = (1, I, 0) - // - B.m_steps = (0, 1, J) + // - C.m_strides = (1, 0, I) // inverse broadcasting for inner dimension + // - A.m_strides = (1, I, 0) + // - B.m_strides = (0, 1, J) // Convolution of time signals (without padding): Y(T-N+1) = X(T) * H(N): --Note: Likely not RAM-bandwidth efficient! // - op = mul // - op.dims = (T-N+1 x N) // iteration dimensions - // - Y.m_steps = (1, 0) // inverse broadcasting: this sums up the individual products - // - X.m_steps = (1, 1) // shift window by 1 for each output sample - // - H.m_steps = (0, -1) // reuse for each output sample; iterate in reverse order for convolution + // - Y.m_strides = (1, 0) // inverse broadcasting: this sums up the individual products + // - X.m_strides = (1, 1) // shift window by 1 for each output sample + // - H.m_strides = (0, -1) // reuse for each output sample; iterate in reverse order for convolution // - H.m_offset = N - 1 // begin with last element (reverse order for convolution) // TODO: double-check all these // TODO: Does the same trick work for 2D images? diff --git a/Source/Common/Include/Sequences.h b/Source/Common/Include/Sequences.h index 95f9a8e38..15484458f 100644 --- a/Source/Common/Include/Sequences.h +++ b/Source/Common/Include/Sequences.h @@ -108,12 +108,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_numParallelSequences = numParallelSequences; m_numTimeSteps = numTimeSteps; // allocate lookup tables (note: except at the start, these don't really allocate new memory most of the time) - // PTRDIFF_MAX indicates not initialized (also in the matrix, which is stored as float). 
- m_distanceToStart.Resize(m_numParallelSequences, m_numTimeSteps); m_distanceToStart.SetValue((float)PTRDIFF_MAX); - m_distanceToEnd.Resize(m_numParallelSequences, m_numTimeSteps); m_distanceToEnd.SetValue((float)PTRDIFF_MAX); - m_distanceToNearestStart.assign(m_numTimeSteps, PTRDIFF_MAX); - m_distanceToNearestEnd.assign(m_numTimeSteps, PTRDIFF_MAX); +#if 1 + if (m_distanceToStart.GetNumRows() != m_numParallelSequences || m_distanceToStart.GetNumCols() != m_numTimeSteps) // sanity check for debugging a regression + fprintf(stderr, "MBLayout::Init: Resizing m_distanceToStart from %d x %d to %d x %d\n", + (int)m_distanceToStart.GetNumRows(), (int)m_distanceToStart.GetNumCols(), (int)m_numParallelSequences, (int)m_numTimeSteps); // (I really want to know about actual allocations, but this is a necessary condition for them) +#endif + m_distanceToStart.Resize(m_numParallelSequences, m_numTimeSteps); + m_distanceToEnd.Resize(m_numParallelSequences, m_numTimeSteps); + m_distanceToNearestStart.assign(m_numTimeSteps, SIZE_MAX); + m_distanceToNearestEnd.assign(m_numTimeSteps, SIZE_MAX); m_timeStepHasGap.assign(m_numTimeSteps, false); + m_columnsValidityMask.Resize(0, 0); // invalidate // reset state m_numFramesDeclared = 0; m_numGapFrames = 0; @@ -121,20 +126,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_writable = true; } - // short-hand to initialize an MBLayout for the common case of frame mode - // In frame mode, there is one parallel "sequence" per sample, which is 1 frame long. - void InitAsFrameMode(size_t numSamples) - { - Init(numSamples, 1); - SequenceInfo seqInfo { 0, 0, 0, 1 }; - for (size_t s = 0; s < numSamples; s++) - { - seqInfo.seqId = seqInfo.s = s; - AddSequence(seqInfo); - } - Lock(); - } - // ------------------------------------------------------------------- // accessors // ------------------------------------------------------------------- @@ -199,7 +190,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { LogicError("AddSequence: Sequence added to an MBLayout must overlap with minibatch."); // remember it +#if 1 + auto cap = m_sequences.capacity(); // some sanity check for debugging a speed regression m_sequences.push_back(seqDesc); + if (cap != m_sequences.capacity()) + fprintf(stderr, "AddSequence: m_sequences was reallocated from capacity %d to %d\n", (int)cap, (int)m_sequences.capacity()); +#else + m_sequences.push_back(seqDesc); +#endif // create all the cached fast-lookup information const auto seqId = seqDesc.seqId; @@ -212,7 +210,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_numGapFrames += (e - b); for (size_t t = b; t < e; t++) { - //Set(s, t, MinibatchPackingFlags::NoInput); m_timeStepHasGap[t] = true; m_distanceToStart(s, t) = -1; // start flags also encode gaps } @@ -220,22 +217,49 @@ namespace Microsoft { namespace MSR { namespace CNTK { else for (size_t t = b; t < e; t++) { // update the nearest sentence boundaries, minimum over all parallel sequences - // -1 in distanceToStart(,) stands for a gap - assert(m_distanceToStart(s, t) != -1); // gaps not allowed to overlap // If 0, then we are on a boundary. If not 0, we can still test in presence of FrameRange.m_timeOffset. 
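                    // Worked example (illustration only): a sequence with beginTime = 2, endTime = 6
                    // in parallel stream s yields, for t = 2..5,
                    //   m_distanceToStart(s, t) = 0, 1, 2, 3   and   m_distanceToEnd(s, t) = 3, 2, 1, 0;
                    // a FrameRange with m_timeOffset = -1 is then beyond the start exactly where
                    // distanceToStart < 1, which is the test IsBeyondStartOrEnd() performs.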
- ptrdiff_t distanceToStart = t - beginTime; - if (m_distanceToStart(s, t) > (float)distanceToStart) - m_distanceToStart(s, t) = (float)distanceToStart; + size_t distanceToStart = (size_t)((ptrdiff_t)t - beginTime); + size_t distanceToEnd = endTime - 1 - t; + m_distanceToStart(s, t) = (float)distanceToStart; + m_distanceToEnd(s, t) = (float)distanceToEnd; + // and the aggregate if (m_distanceToNearestStart[t] > distanceToStart) m_distanceToNearestStart[t] = distanceToStart; - ptrdiff_t distanceToEnd = endTime - 1 - t; - if (m_distanceToEnd(s, t) > (float) distanceToEnd) - m_distanceToEnd(s, t) = (float) distanceToEnd; if (m_distanceToNearestEnd[t] > distanceToEnd) m_distanceToNearestEnd[t] = distanceToEnd; } } + // short-hand to initialize an MBLayout for the common case of frame mode + // In frame mode, there is one parallel "sequence" per sample, which is 1 frame long. + // This function provides an efficient short-cut implementation of AddSequence(t, t, 0, 1) for every sample t. + void InitAsFrameMode(size_t numSamples) + { + Init(numSamples, 1); + + // create sequences array + SequenceInfo virginSeqInfo = { 0, 0, 0, 1 }; + m_sequences.resize(numSamples, virginSeqInfo); // pass it here since otherwise STL will initialize everything to 0 unnecessarily + + // update sequence indices + for (size_t s = 0; s < numSamples; s++) + { + // remember it + auto & seqDesc = m_sequences[s]; + seqDesc.seqId = s; + seqDesc.s = s; + } + m_numFramesDeclared = numSamples; + + // create all the cached fast-lookup information + m_distanceToStart.SetValue(0); + m_distanceToEnd.SetValue(0); + m_distanceToNearestStart[0] = 0; + m_distanceToNearestEnd[0] = 0; + + Lock(); + } + // mark a range of frames in a parallel sequence as invalid // I'd love to start with all-gaps, but that would require to set flags upfront, and then clearing them. void AddGap(size_t s, ptrdiff_t beginTime, size_t endTime) { if ((ptrdiff_t)endTime > beginTime) AddSequence(GAP_SEQUENCE_ID, s, beginTime, endTime); } @@ -330,10 +354,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { // 2 1 0 . . ] // (last two time steps undefined) // m_distanceToNearestStart = [ 0 1 2 3 4 ] // m_distanceToNearestEnd = [ 2 1 0 1 0 ] - Matrix m_distanceToStart, m_distanceToEnd; // (s,t); value<0 stands for gap, PTRDIFF_MAX for 'not initialized' - vector m_distanceToNearestStart, m_distanceToNearestEnd; // [t] (value<0 does NOT stand for gap; consult m_timeStepHasGap[] vector instead) + Matrix m_distanceToStart, m_distanceToEnd; // (s,t); value<0 stands for gap + vector m_distanceToNearestStart, m_distanceToNearestEnd; // [t] (does not store info about gaps; consult m_timeStepHasGap[] vector instead) - vector m_timeStepHasGap; // [t] + vector m_timeStepHasGap; // [t] true if at least one gap in time step t // Cached mask indicating the validity of each column in the MBLayout // TODO: We actually just need a boolean matrix for this. @@ -527,6 +551,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (s == SIZE_MAX) // aggregate requested { // determine flags from aggregate vectors + assert(m_distanceToNearestStart[t] != SIZE_MAX); // (sanity check) auto distanceToStart = (ptrdiff_t)m_distanceToNearestStart[t]; if (distanceToStart < -fr.m_timeOffset) return true; @@ -557,7 +582,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // TODO: Remove this version (with sanity checks) after this has been tested. Then the function can be inlined above. 
inline size_t MBLayout::GetActualNumSamples() const { -#if 1 // sanity check --TODO: delete this after a while +#if 0 // sanity check --TODO: delete this after a while size_t n = GetNumCols(); if (HasGaps()) { diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 3dc6a3f20..7f89060a6 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -1520,7 +1520,7 @@ protected: \ using Base::CreateUniqId; \ using Base::GetNumInputs; using Base::ZeroGradientsOfInputs; using Base::VerifyDims; \ using Base::ConstOnes; \ - using Base::GetImageLayout; using Base::InferImageDimsFromInput; using Base::InferImageDimsFromInputs; using Base::InferMBLayoutFromInputsForStandardCase; \ + using Base::GetImageLayout; using Base::GetTensorsForwardBinary; using Base::InferImageDimsFromInput; using Base::InferImageDimsFromInputs; using Base::InferMBLayoutFromInputsForStandardCase; \ using Base::CopyTo; using Base::CreateUniqNodeName; using Base::DetachInputs; using Base::GetInputsFromConfig; \ using Base::DumpNodeInfo; using Base::EnumerateNodes; \ using Base::HasMBLayout; using Base::GetMBLayout; using Base::LinkToMBLayout; \ diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index 7f644687e..0bd30e22b 100644 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -5205,5 +5205,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { template void Matrix::SetValue(const char); template void Matrix::SetValue(size_t numRows, const size_t numCols, int deviceId, char *pArray, size_t matrixFlags); template bool Matrix::IsEmpty() const; + template void Matrix::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, bool growOnly); }}} diff --git a/Source/Math/TensorView.cpp b/Source/Math/TensorView.cpp index 1c843c788..21fab4559 100644 --- a/Source/Math/TensorView.cpp +++ b/Source/Math/TensorView.cpp @@ -60,92 +60,109 @@ namespace Microsoft { namespace MSR { namespace CNTK { template void TensorView::DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op) { +#define N 3 // later make this a template parameter. N=1 is possible for generators, such as constants. + array shapes; TensorView & c = *this; - // TODO: Turn the inner meat here into a function template using a std::array<., N-nariness>. Nullary ops are generators, e.g. constants. + shapes[0] = a.GetShape(); + shapes[1] = b.GetShape(); + shapes[2] = c.GetShape(); // last one is the output // massage TensorShapes // Note that TensorShapes here may be shapes are stored or shapes with stride magic applied. - auto as = a.GetShape().GetDims(); - auto bs = b.GetShape().GetDims(); - auto cs = c.GetShape().GetDims(); // expand ones to make tensors compatible // Trailing dimensions broadcast. // E.g. A(J) vs. B(J x T) will broadcast A(:) to all T columns. // To broadcast an A(T) to all J rows of B, use TensorShape editing to insert a dimension to get A(1,T). 
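        // Example (illustration only): adding a per-row bias a(13) to a minibatch b(13 x 42):
        // after Pad(2) the shapes are a = 13 x 1, b = c = 13 x 42, and opDims = 13 x 42;
        // WithBroadcastStrides() later turns a's strides (1, 13) into (1, 0), so the same
        // 13 values of a are re-read for every column of b.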
- auto dims = max(max(as.size(), bs.size()), cs.size()); - as.resize(dims, 1); - bs.resize(dims, 1); - cs.resize(dims, 1); + size_t dims = 0; + for (size_t i = 0; i < N; i++) + if (dims < shapes[i].GetNumDims()) + dims = shapes[i].GetNumDims(); + for (size_t i = 0; i < N; i++) + shapes[i] = shapes[i].Pad(dims); // determine operation shape (max over all dimensions) - decltype(as) os(dims); + vector opDims(dims, 0); for (size_t k = 0; k < dims; k++) - os[k] = max(max(as[k], bs[k]), cs[k]); + for (size_t i = 0; i < N; i++) + opDims[k] = max(opDims[k], shapes[i][k]); // dimension compatibility check // Each participant can broadcast. Non-broadcasting dimensions must match the operation dimension. for (size_t k = 0; k < dims; k++) - { - if (!Matches(as[k], os[k]) || !Matches(bs[k], os[k]) || !Matches(cs[k], os[k])) - InvalidArgument("Binary tensor operation: Dimension %d is incompatible between the two inputs and output (%d vs. %d vs. %d)", (int)dims, (int)as[k], (int)bs[k], (int)cs[k]); - } + for (size_t i = 0; i < N; i++) + if (!Matches(shapes[i][k], opDims[k])) + InvalidArgument("Binary tensor operation: Dimension %d is incompatible between input %d and output (%s vs. %s)", (int)k, (int)shapes[i][k], string(shapes[i]).c_str(), string(TensorShape(opDims)).c_str()); // flatten consecutive dimensions // Dimensions must be consecutive in memory, and either non-broadcasting or all-broadcasting, across all dimensions. // After this, as, bs, and cs no longer match the TensorShape objects. + fprintf(stderr, "Pre-flatten: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); for (size_t k = 1; k < dims; k++) { - // check if stored without gaps to skip - if (!a.GetShape().CanFlatten(k) || !b.GetShape().CanFlatten(k) || !c.GetShape().CanFlatten(k)) - continue; - // check if they are either all broadcasting or all not broadcasting - if ((as[k] != os[k] || as[k - 1] != os[k - 1]) && (as[k] != 1 || as[k - 1] != 1)) - continue; - if ((bs[k] != os[k] || bs[k - 1] != os[k - 1]) && (bs[k] != 1 || bs[k - 1] != 1)) - continue; - if ((cs[k] != os[k] || cs[k - 1] != os[k - 1]) && (cs[k] != 1 || cs[k - 1] != 1)) - continue; - // merge the dimensions - as[k] *= as[k - 1]; as[k - 1] = 1; - bs[k] *= bs[k - 1]; bs[k - 1] = 1; - cs[k] *= cs[k - 1]; cs[k - 1] = 1; - os[k] *= os[k - 1]; os[k - 1] = 1; - // BUGBUG: Must update multipliers as well + for (size_t i = 0; i < N; i++) + { + // check if stored without gaps to skip + if (!shapes[i].CanFlatten(k)) + goto nope; + // check if they are either all broadcasting or all not broadcasting + if ((shapes[i][k] != opDims[k] || shapes[i][k - 1] != opDims[k - 1]) && (shapes[i][k] != 1 || shapes[i][k - 1] != 1)) + goto nope; + } + // these dimensions can be merged + for (size_t i = 0; i < N; i++) + shapes[i] = shapes[i].Flatten(k); // TODO: overdoing the immutable thingy much? 
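                // note: the iteration dimensions must be flattened in lock-step with the
                // operand shapes, or the per-dimension strides would no longer line up with opDims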
+ opDims = TensorShape(opDims).Flatten(k).GetDims(); // (ugh) + nope:; } + fprintf(stderr, "Post-flatten: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); // remove singleton dimensions - size_t j = 0; + vector toDrop(dims, false); for (size_t k = 0; k < dims; k++) { - if (as[k] == 1 && bs[k] == 1 && cs[k] == 1) // skip all-singleton dimensions - continue; - as[j] = as[k]; - bs[j] = bs[k]; - cs[j] = cs[k]; - os[j] = os[k]; - j++; + for (size_t i = 0; i < N; i++) + if (shapes[i][k] != 1) + goto neither; + toDrop[k] = true; // found an all-singleton dimensions + neither:; } - // note: if op is a scalar, then we end up with 0 dimensions here - dims = j; - as.resize(dims); - bs.resize(dims); - cs.resize(dims); - os.resize(dims); - let as1 = TensorShape(as); // BUGBUG: We just lost stride info. - let bs1 = TensorShape(bs); - let cs1 = TensorShape(cs); - let os1 = TensorShape(os); + for (size_t i = 0; i < N; i++) + shapes[i] = shapes[i].DropSingletonDims(toDrop); + opDims = TensorShape(opDims).DropSingletonDims(toDrop).GetDims(); // (ugh) + // note: if op is a scalar, then we end up with 0 dimensions here, which is allowed + fprintf(stderr, "Post-drop: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); + + // determine broadcasting; that is, set strides to 0 for 1-dimensions + // To be more precise, we should only set actually broadcasting dimensions to 0. + // But since dimensions that are 1 across all args are eliminated, any 1 must be some form of broadcasting. + // TODO: Do we need to allow other strides at this point in time? If not, broadcasting becomes a bit vector. + for (size_t i = 0; i < N; i++) + shapes[i] = shapes[i].WithBroadcastStrides(); // determine inverse broadcasting dimensions - // TODO: describe the resulting for loop as a set of tensor dims and strides as well. - vector cBroadcasts(dims); - for (size_t k = 0; k < dims; k++) - cBroadcasts[k] = cs1[k] == 1 && (as1[k] != 1 || bs1[k] != 1); + // Inverse broadcasting dims are actual for loops in the kernel, whereas broadcasting input dims are handled by the thread index. 
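        // (Example of inverse broadcasting: reducing c(13 x 1) over the columns of a(13 x 42):
        // c's column stride becomes 0, so the kernel must loop over the 42 columns and
        // accumulate, rather than map that dimension onto the thread grid.)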
+ // For regular input dims: + // - determine number of steps (product over opDims[.]) + // - launch that many kernels + // - pass in: + // - total number of steps + // - strides for all inputs (with stride magic), separated by regular and inverse broadcasting dimensions + // - opDim (no stride magic allowed) for regular broadcasting dimensions + // - reverse broadcasting dimensions + // - opcodes for elementwise op and reduction op + // - in each kernel: + // - map thread index to dimensions (regular broadcasting ones) + // - for-loop over inverse broadcasting dimensions + // - map dimensions (including inverse broadcasting) for every input + // - perform op on the input values + // - accumulate + // - map dimensions (regular) for output + // - save result // now perform the operation - fprintf(stderr, "Op %d: %s op %s -> %s via %s\n", (int)op, string(as1).c_str(), string(bs1).c_str(), string(cs1).c_str(), string(os1).c_str()); + fprintf(stderr, "Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); // :) beta; alpha; } @@ -155,9 +172,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { template /*static*/ void TensorView::Test() { - Matrix m1(0); m1.Resize(1, 42); - Matrix m2(0); m2.Resize(13, 1); - Matrix m3(0); m3.Resize(13, 21); + Matrix m1(-1); m1.Resize(1, 42); + Matrix m2(-1); m2.Resize(13, 1); + Matrix m3(-1); m3.Resize(13, 21); TensorShape s1(1, 2, 21); TensorShape s2(13, 1); TensorShape s3(13, 1, 21); diff --git a/Source/Math/TensorView.h b/Source/Math/TensorView.h index 1a8088a70..be037fa5b 100644 --- a/Source/Math/TensorView.h +++ b/Source/Math/TensorView.h @@ -68,8 +68,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // sob members // ------------------------------------------------------------------- - Matrix * m_sob; // Storage OBject that holds the data that is being viewed with this TensorView. Pointer instead of ref so this object is copyable. - TensorShape m_shape; // the meta-data that describes the data's shape and/or access pattern + Matrix * m_sob; // Storage OBject that holds the data that is being viewed with this TensorView. Pointer instead of ref so this object is copyable. + TensorShape m_shape; // the meta-data that describes the data's shape and/or access pattern // TODO: use a reference here or not? 
With a reference, we can hide more info in here such as cuDNN handles }; From e6040d050dbbe11b937e2222eeea4d335a562731 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Thu, 17 Dec 2015 11:35:28 -0800 Subject: [PATCH 04/19] made Linux build happy (missing explicit method template specialization of CPUMatrix::Resize()) --- Source/Math/CPUMatrix.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index ba1bc077d..937f642c5 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -5551,5 +5551,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { template void CPUMatrix::SetValue(const char); template void CPUMatrix::SetValue(const size_t numRows, const size_t numCols, char *pArray, size_t matrixFlags); template void CPUMatrix::SetValue(CPUMatrix const&); + template void CPUMatrix::Resize(const size_t numRows, const size_t numCols, bool growOnly); }}} From aa5d1a7213880b4b00eafc1bb09002fbfdc4b08b Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Thu, 17 Dec 2015 15:50:00 -0800 Subject: [PATCH 05/19] implemented plumbing and first shot for TensorView operation with reduction --- Source/Math/CPUMatrix.cpp | 137 +++++++++++++++++ Source/Math/CPUMatrix.h | 5 + Source/Math/Matrix.cpp | 301 +++++++++++++++++++------------------ Source/Math/Matrix.h | 7 +- Source/Math/TensorView.cpp | 35 ++++- Source/Math/TensorView.h | 2 +- 6 files changed, 334 insertions(+), 153 deletions(-) diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index 937f642c5..092cd8c9e 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -5533,6 +5533,143 @@ namespace Microsoft { namespace MSR { namespace CNTK { return numThreads; } + // ----------------------------------------------------------------------- + // TensorView support + // ----------------------------------------------------------------------- + + // perform loop over reduction index m + // This function is declared inside a wrapper struct to allow partial specialization (m = -1). + template + struct TensorOpReduction + { + // reduction case (non-reduction case is specialized) + static inline ElemType Loop(array pointers, const OPFN & opfn, + const std::vector & reducingOpDims, const std::array, N> & reducingStrides) + { + array strides; + for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled + strides[i] = reducingStrides[i][(size_t)m]; + ElemType aggregate = 0; + for (size_t dim = reducingOpDims[(size_t)m]; dim-- > 0;) + { + // need to descend into one loop deeper + aggregate += TensorOpReduction::Loop(pointers, opfn, reducingOpDims, reducingStrides); + // advance the pointers + for (size_t i = 0; i < N; i++) + pointers[i] += strides[i]; + } + return aggregate; + } + }; + + // perform loop over reduction index m + // This is the specialized version for m = -1, which terminates the recursion. + template + struct TensorOpReduction + { + static inline ElemType Loop(array pointers, const OPFN & opfn, + const std::vector &, const std::array, N> &) + { + return opfn(pointers); // finally we are doing some work!!! 
+ } + }; + + // perform loop over regular index k and reducing index m for N operands (counting the output) + template + struct TensorOpIteration + { + static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, + const std::vector & regularOpDims, const std::array, N> & regularStrides, + const std::vector & reducingOpDims, const std::array, N> & reducingStrides) + { + // non-scalar case: still nested result loops left + array strides; + for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled + strides[i] = regularStrides[i][(size_t)k]; + for (size_t dim = regularOpDims[(size_t)k]; dim--> 0;) + { + // need to descend into one loop deeper + TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + // advance the pointers + for (size_t i = 0; i < N; i++) + pointers[i] += strides[i]; + } + } + }; + + template + struct TensorOpIteration + { + static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, + const std::vector &, const std::array, N> &, + const std::vector & reducingOpDims, const std::array, N> & reducingStrides) + { + // we are at element level for the result: perform the op (there may still be reduction) + ElemType val = alpha * TensorOpReduction::Loop(pointers, opfn, reducingOpDims, reducingStrides); + // combine with previous value in target matrix, then write it out + auto * pout = pointers.back(); + if (beta != 0) + val += beta * *pout; + *pout = val; + return; + } + }; + + // tensor operation with k+1 dimensions (-1 means scalar) + template + static inline void TensorOpWithRegularLoop(ElemType beta, const array & pointers, ElemType alpha, const OPFN & opfn, + const std::vector & regularOpDims, const std::array, N> & regularStrides, + const std::vector & reducingOpDims, const std::array, N> & reducingStrides) + { + size_t dims = regularOpDims.size(); + switch (dims) + { + case 2: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 1: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 0: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + default: LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (int)dims); + } + } + + // tensor operation, generalized in number of arguments, operation already provided as a lambda + // This function now expands into different k. 
+ template + static inline void TensorOpWithFn(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, + const std::array & offsets, + const std::vector & regularOpDims, const std::array, N> & regularStrides, + const std::vector & reducingOpDims, const std::array, N> & reducingStrides) + { + for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled + pointers[i] += offsets[i]; + size_t dims = regularOpDims.size(); + switch (dims) + { + case 4: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 3: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 2: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 1: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 0: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + default: LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)dims); + } + } + + // perform binary operation 'op' on a and b giving c, reinterpreting the matrices as tensors as specified by the dims and strides + template + void CPUMatrix::TensorOp(ElemType beta, const CPUMatrix& a, const CPUMatrix& b, ElemType alpha, ElementWiseOperator op, + const std::array & offsets, + const std::vector & regularOpDims, const std::array, 3> & regularStrides, + const std::vector & reducingOpDims, const std::array, 3> & reducingStrides) + { + array pointers = { a.m_pArray, b.m_pArray, m_pArray }; + switch (op) + { + case ElementWiseOperator::opSum: + return TensorOpWithFn(beta, pointers, alpha, [](const array & pp) { return *(pp[0]) + *(pp[1]); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + default: + LogicError("TensorNnaryOp: Unknown op code %d.", (int)op); + } + } + // The explicit instantiation part template class MATH_API CPUMatrix; template class MATH_API CPUMatrix; diff --git a/Source/Math/CPUMatrix.h b/Source/Math/CPUMatrix.h index 83d63559b..6128204c4 100644 --- a/Source/Math/CPUMatrix.h +++ b/Source/Math/CPUMatrix.h @@ -334,6 +334,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { static bool AreEqual(const CPUMatrix& a, const CPUMatrix& b, const ElemType threshold = 1e-8); static void TensorShuffleScaleAndAdd(ElemType keepWeight, const CPUMatrix& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const CPUMatrix& b, CPUMatrix& c); + + void TensorOp(ElemType beta, const CPUMatrix& a, const CPUMatrix& b, ElemType alpha, ElementWiseOperator op, + const std::array & offsets, + const std::vector & regularOpDims, const std::array, 3> & regularStrides, + const std::vector & reducingOpDims, const std::array, 3> & reducingStrides); static CPUMatrix Ones(const size_t rows, const size_t cols); static CPUMatrix Zeros(const size_t rows, const size_t cols); diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index 0bd30e22b..d49caee4e 100644 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -4794,7 +4794,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } - + template bool Matrix::HasElement(const Matrix& a, const ElemType value) { @@ -4936,148 +4936,144 @@ namespace Microsoft { namespace MSR { namespace CNTK { return *this; } - template - Matrix& 
Matrix::AssignElementProductOfWithShiftNeg(const Matrix& a, const Matrix& b, size_t shift, size_t negnumber) - { - if (a.IsEmpty() || b.IsEmpty()) - LogicError("AssignElementProductOfWithShiftNeg: Matrix is empty."); + template + Matrix& Matrix::AssignElementProductOfWithShiftNeg(const Matrix& a, const Matrix& b, size_t shift, size_t negnumber) + { + if (a.IsEmpty() || b.IsEmpty()) + LogicError("AssignElementProductOfWithShiftNeg: Matrix is empty."); - assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); - if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) - InvalidArgument("The input matrix dimensions do not match."); + assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); + if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) + InvalidArgument("The input matrix dimensions do not match."); - if (a.GetNumRows() != 1) - InvalidArgument("AssignElementProductOfWithShiftNeg: The input matrix must be a row vector."); + if (a.GetNumRows() != 1) + InvalidArgument("AssignElementProductOfWithShiftNeg: The input matrix must be a row vector."); - DecideAndMoveToRightDevice(a, b, *this); - if (!(a.GetMatrixType() == b.GetMatrixType())) - NOT_IMPLEMENTED; + DecideAndMoveToRightDevice(a, b, *this); + if (!(a.GetMatrixType() == b.GetMatrixType())) + NOT_IMPLEMENTED; this->SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); - DISPATCH_MATRIX_ON_FLAG(this, - this, - this->m_CPUMatrix->AssignElementProductOfWithShiftNeg(*a.m_CPUMatrix, *b.m_CPUMatrix, shift, negnumber), - this->m_GPUMatrix->AssignElementProductOfWithShiftNeg(*a.m_GPUMatrix, *b.m_GPUMatrix, shift, negnumber), - NOT_IMPLEMENTED, - NOT_IMPLEMENTED - ); - return *this; - } + DISPATCH_MATRIX_ON_FLAG(this, + this, + this->m_CPUMatrix->AssignElementProductOfWithShiftNeg(*a.m_CPUMatrix, *b.m_CPUMatrix, shift, negnumber), + this->m_GPUMatrix->AssignElementProductOfWithShiftNeg(*a.m_GPUMatrix, *b.m_GPUMatrix, shift, negnumber), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED + ); + return *this; + } + template + Matrix& Matrix::AssignInnerProductOfWithShiftNeg(const Matrix& a, const Matrix& b, const bool isColWise, size_t shift, size_t negnumber) + { + InnerProductWithShiftNeg(a, b, *this, isColWise, shift, negnumber); + return *this; + } - template - Matrix& Matrix::AssignInnerProductOfWithShiftNeg(const Matrix& a, const Matrix& b, const bool isColWise, size_t shift, size_t negnumber) - { - InnerProductWithShiftNeg(a, b, *this, isColWise, shift, negnumber); - return *this; - } - template - void Matrix::InnerProductWithShiftNeg(const Matrix& a, const Matrix& b, Matrix& c, const bool isColWise, size_t shift, size_t negnumber) - { - if (a.IsEmpty() || b.IsEmpty()) - LogicError("InnerProduct: one of the input matrix is empty."); + template + void Matrix::InnerProductWithShiftNeg(const Matrix& a, const Matrix& b, Matrix& c, const bool isColWise, size_t shift, size_t negnumber) + { + if (a.IsEmpty() || b.IsEmpty()) + LogicError("InnerProduct: one of the input matrix is empty."); - DecideAndMoveToRightDevice(a, b, c); + DecideAndMoveToRightDevice(a, b, c); - if (a.GetMatrixType() != b.GetMatrixType()) - NOT_IMPLEMENTED; + if (a.GetMatrixType() != b.GetMatrixType()) + NOT_IMPLEMENTED; c.SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); - DISPATCH_MATRIX_ON_FLAG(&c, - &c, - CPUMatrix::InnerProductWithShiftNeg(*a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix, isColWise, shift, negnumber), - GPUMatrix::InnerProductWithShiftNeg(*a.m_GPUMatrix, 
*b.m_GPUMatrix, *c.m_GPUMatrix, shift, negnumber), - NOT_IMPLEMENTED, - NOT_IMPLEMENTED - ); + DISPATCH_MATRIX_ON_FLAG(&c, + &c, + CPUMatrix::InnerProductWithShiftNeg(*a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix, isColWise, shift, negnumber), + GPUMatrix::InnerProductWithShiftNeg(*a.m_GPUMatrix, *b.m_GPUMatrix, *c.m_GPUMatrix, shift, negnumber), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED + ); + } - } + template + Matrix& Matrix::GetARowByIndex(const Matrix& a, size_t index) + { + if (a.IsEmpty()) + LogicError("GetARowByIndex: Matrix is empty."); - template - Matrix& Matrix::GetARowByIndex(const Matrix& a, size_t index) - { - if (a.IsEmpty()) - LogicError("GetARowByIndex: Matrix is empty."); - - - //WARNING: a and this must have same type - if (!(GetMatrixType() == a.GetMatrixType())) - NOT_IMPLEMENTED; + //WARNING: a and this must have same type + if (!(GetMatrixType() == a.GetMatrixType())) + NOT_IMPLEMENTED; SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); - DISPATCH_MATRIX_ON_FLAG(this, - this, - this->m_CPUMatrix->GetARowByIndex(*a.m_CPUMatrix, index), - this->m_GPUMatrix->GetARowByIndex(*a.m_GPUMatrix, index), - NOT_IMPLEMENTED, - NOT_IMPLEMENTED - ); + DISPATCH_MATRIX_ON_FLAG(this, + this, + this->m_CPUMatrix->GetARowByIndex(*a.m_CPUMatrix, index), + this->m_GPUMatrix->GetARowByIndex(*a.m_GPUMatrix, index), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED + ); - return *this; - } + return *this; + } - template - void Matrix::ConductRowElementMultiplyWithShift(const Matrix& a, const Matrix& b, Matrix& c, size_t shift, bool bFirstmatrixfixed) - { - if (a.IsEmpty() || b.IsEmpty()) - LogicError("InnerProduct: one of the input matrix is empty."); + template + void Matrix::ConductRowElementMultiplyWithShift(const Matrix& a, const Matrix& b, Matrix& c, size_t shift, bool bFirstmatrixfixed) + { + if (a.IsEmpty() || b.IsEmpty()) + LogicError("InnerProduct: one of the input matrix is empty."); - DecideAndMoveToRightDevice(a, b, c); + DecideAndMoveToRightDevice(a, b, c); - if (a.GetMatrixType() != b.GetMatrixType()) - NOT_IMPLEMENTED; + if (a.GetMatrixType() != b.GetMatrixType()) + NOT_IMPLEMENTED; c.SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); - DISPATCH_MATRIX_ON_FLAG(&c, - &c, - CPUMatrix::ConductRowElementMultiplyWithShift(*a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix, shift, bFirstmatrixfixed), - GPUMatrix::ConductRowElementMultiplyWithShift(*a.m_GPUMatrix, *b.m_GPUMatrix, *c.m_GPUMatrix, shift, bFirstmatrixfixed), - NOT_IMPLEMENTED, - NOT_IMPLEMENTED - ); + DISPATCH_MATRIX_ON_FLAG(&c, + &c, + CPUMatrix::ConductRowElementMultiplyWithShift(*a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix, shift, bFirstmatrixfixed), + GPUMatrix::ConductRowElementMultiplyWithShift(*a.m_GPUMatrix, *b.m_GPUMatrix, *c.m_GPUMatrix, shift, bFirstmatrixfixed), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED + ); + } - } + template + Matrix& Matrix::AssignElementProductOfWithShift(const Matrix& a, const Matrix& b, size_t shift) + { + if (a.IsEmpty() || b.IsEmpty()) + LogicError("AssignElementProductOfWithShift: Matrix is empty."); - template - Matrix& Matrix::AssignElementProductOfWithShift(const Matrix& a, const Matrix& b, size_t shift) - { - if (a.IsEmpty() || b.IsEmpty()) - LogicError("AssignElementProductOfWithShift: Matrix is empty."); + assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); + if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) + InvalidArgument("The input matrix dimensions do not match."); - assert(a.GetNumRows() == b.GetNumRows() && 
a.GetNumCols() == b.GetNumCols()); - if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) - InvalidArgument("The input matrix dimensions do not match."); + if (a.GetNumRows() != 1) + InvalidArgument("AssignElementProductOfWithShiftNeg: The input matrix must be a row vector."); - if (a.GetNumRows() != 1) - InvalidArgument("AssignElementProductOfWithShiftNeg: The input matrix must be a row vector."); - - DecideAndMoveToRightDevice(a, b, *this); - if (!(a.GetMatrixType() == b.GetMatrixType())) - NOT_IMPLEMENTED; + DecideAndMoveToRightDevice(a, b, *this); + if (!(a.GetMatrixType() == b.GetMatrixType())) + NOT_IMPLEMENTED; this->SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); - DISPATCH_MATRIX_ON_FLAG(this, - this, - this->m_CPUMatrix->AssignElementProductOfWithShift(*a.m_CPUMatrix, *b.m_CPUMatrix, shift), - this->m_GPUMatrix->AssignElementProductOfWithShift(*a.m_GPUMatrix, *b.m_GPUMatrix, shift), - NOT_IMPLEMENTED, - NOT_IMPLEMENTED - ); - return *this; - } - + DISPATCH_MATRIX_ON_FLAG(this, + this, + this->m_CPUMatrix->AssignElementProductOfWithShift(*a.m_CPUMatrix, *b.m_CPUMatrix, shift), + this->m_GPUMatrix->AssignElementProductOfWithShift(*a.m_GPUMatrix, *b.m_GPUMatrix, shift), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED + ); + return *this; + } template void Matrix::RCRFBackwardCompute(const Matrix& alpha, Matrix& beta, - Matrix& functionValues, const Matrix& lbls, - const Matrix& pos_scores, const Matrix& pair_scores, const int shift) + Matrix& functionValues, const Matrix& lbls, + const Matrix& pos_scores, const Matrix& pair_scores, const int shift) { DecideAndMoveToRightDevice(alpha, beta); functionValues._transferToDevice(alpha.GetDeviceId()); @@ -5134,55 +5130,70 @@ namespace Microsoft { namespace MSR { namespace CNTK { ); } - template - Matrix& Matrix::DropFrame(const Matrix& label, const Matrix& gamma, const ElemType & threshhold) - { - DecideAndMoveToRightDevice(*this, label, gamma); + template + Matrix& Matrix::DropFrame(const Matrix& label, const Matrix& gamma, const ElemType & threshhold) + { + DecideAndMoveToRightDevice(*this, label, gamma); - if (label.GetNumCols() != gamma.GetNumCols() || label.GetNumRows() != gamma.GetNumRows()) - LogicError("DropFrame: label matrix is not in the same size as gamm matrix."); - this->SwitchToMatrixType(label.GetMatrixType(), label.GetFormat(), false); + if (label.GetNumCols() != gamma.GetNumCols() || label.GetNumRows() != gamma.GetNumRows()) + LogicError("DropFrame: label matrix is not in the same size as gamm matrix."); + this->SwitchToMatrixType(label.GetMatrixType(), label.GetFormat(), false); - DISPATCH_MATRIX_ON_FLAG(this, - this, - this->m_CPUMatrix->DropFrame(*label.m_CPUMatrix, *gamma.m_CPUMatrix, threshhold), - this->m_GPUMatrix->DropFrame(*label.m_GPUMatrix, *gamma.m_GPUMatrix, threshhold), - NOT_IMPLEMENTED, - NOT_IMPLEMENTED - ); + DISPATCH_MATRIX_ON_FLAG(this, + this, + this->m_CPUMatrix->DropFrame(*label.m_CPUMatrix, *gamma.m_CPUMatrix, threshhold), + this->m_GPUMatrix->DropFrame(*label.m_GPUMatrix, *gamma.m_GPUMatrix, threshhold), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED + ); - return *this; - } + return *this; + } - /// c = alpha * (a-b) - /// if a, b, c must have same dim - /// Scalar - /// Input matrix - /// Input matrix - /// Resulting matrix, user is responsible for allocating this - template - Matrix& Matrix::AssignSequenceError(const ElemType hsmoothingWeight, const Matrix& label, - const Matrix& dnnoutput, const Matrix& gamma, ElemType alpha) - { - DecideAndMoveToRightDevice(label, dnnoutput, 
gamma); + /// c = alpha * (a-b) + /// if a, b, c must have same dim + /// Scalar + /// Input matrix + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + Matrix& Matrix::AssignSequenceError(const ElemType hsmoothingWeight, const Matrix& label, + const Matrix& dnnoutput, const Matrix& gamma, ElemType alpha) + { + DecideAndMoveToRightDevice(label, dnnoutput, gamma); - if (!(label.GetMatrixType() == gamma.GetMatrixType())) - NOT_IMPLEMENTED; + if (!(label.GetMatrixType() == gamma.GetMatrixType())) + NOT_IMPLEMENTED; - this->SwitchToMatrixType(label.GetMatrixType(), label.GetFormat(), false); + this->SwitchToMatrixType(label.GetMatrixType(), label.GetFormat(), false); - DISPATCH_MATRIX_ON_FLAG(this, - this, - this->m_CPUMatrix->AssignSequenceError(hsmoothingWeight, *label.m_CPUMatrix, *dnnoutput.m_CPUMatrix, *gamma.m_CPUMatrix, alpha), - this->m_GPUMatrix->AssignSequenceError(hsmoothingWeight, *label.m_GPUMatrix, *dnnoutput.m_GPUMatrix, *gamma.m_GPUMatrix, alpha), - NOT_IMPLEMENTED, - NOT_IMPLEMENTED - ); - return *this; - } + DISPATCH_MATRIX_ON_FLAG(this, + this, + this->m_CPUMatrix->AssignSequenceError(hsmoothingWeight, *label.m_CPUMatrix, *dnnoutput.m_CPUMatrix, *gamma.m_CPUMatrix, alpha), + this->m_GPUMatrix->AssignSequenceError(hsmoothingWeight, *label.m_GPUMatrix, *dnnoutput.m_GPUMatrix, *gamma.m_GPUMatrix, alpha), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED + ); + return *this; + } #pragma endregion Static BLAS Functions + template + void Matrix::TensorOp(ElemType beta, const Matrix& a, const Matrix& b, ElemType alpha, ElementWiseOperator op, + const array & offsets, + const vector & regularOpDims, const array, 3> & regularStrides, + const vector & reducingOpDims, const array, 3> & reducingStrides) + { + DISPATCH_MATRIX_ON_FLAG(this, + this, + m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, *b.m_CPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides), + NOT_IMPLEMENTED, //m_GPUMatrix->TensorOp(beta, offsets, *a.m_GPUMatrix, *b.m_GPUMatrix, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED + ); + } + template class Matrix; template class Matrix; diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index f2d00cf84..0a6c488c4 100644 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -16,11 +16,11 @@ #include "CommonMatrix.h" #include #include // for shared_ptr +#include // This class is exported from the Math.dll namespace Microsoft { namespace MSR { namespace CNTK { - enum CurrentDataLocation { NONE, CPU, GPU, BOTH @@ -458,6 +458,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { static bool HasElement(const Matrix& a, const ElemType value = 0.0); static void TensorShuffleScaleAndAdd(ElemType keepWeight, const Matrix& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const Matrix& b, Matrix& c); + + void TensorOp(ElemType beta, const Matrix& a, const Matrix& b, ElemType alpha, ElementWiseOperator op, + const std::array & offsets, + const std::vector & regularOpDims, const std::array, 3> & regularStrides, + const std::vector & reducingOpDims, const std::array, 3> & reducingStrides); public: void Read(File& stream); void Write(File& stream) const; diff --git a/Source/Math/TensorView.cpp b/Source/Math/TensorView.cpp index 21fab4559..2a64f3e64 100644 --- a/Source/Math/TensorView.cpp +++ b/Source/Math/TensorView.cpp @@ -98,7 +98,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // flatten consecutive 
dimensions // Dimensions must be consecutive in memory, and either non-broadcasting or all-broadcasting, across all dimensions. // After this, as, bs, and cs no longer match the TensorShape objects. - fprintf(stderr, "Pre-flatten: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); + //fprintf(stderr, "Pre-flatten: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); for (size_t k = 1; k < dims; k++) { for (size_t i = 0; i < N; i++) @@ -116,7 +116,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { opDims = TensorShape(opDims).Flatten(k).GetDims(); // (ugh) nope:; } - fprintf(stderr, "Post-flatten: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); + //fprintf(stderr, "Post-flatten: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); // remove singleton dimensions vector toDrop(dims, false); @@ -132,7 +132,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { shapes[i] = shapes[i].DropSingletonDims(toDrop); opDims = TensorShape(opDims).DropSingletonDims(toDrop).GetDims(); // (ugh) // note: if op is a scalar, then we end up with 0 dimensions here, which is allowed - fprintf(stderr, "Post-drop: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); + //fprintf(stderr, "Post-drop: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); // determine broadcasting; that is, set strides to 0 for 1-dimensions // To be more precise, we should only set actually broadcasting dimensions to 0. @@ -141,6 +141,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t i = 0; i < N; i++) shapes[i] = shapes[i].WithBroadcastStrides(); + fprintf(stderr, "Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); + // determine inverse broadcasting dimensions // Inverse broadcasting dims are actual for loops in the kernel, whereas broadcasting input dims are handled by the thread index. // For regular input dims: @@ -161,10 +163,31 @@ namespace Microsoft { namespace MSR { namespace CNTK { // - map dimensions (regular) for output // - save result + // separate out the inverse-broadcasting dimensions + // Any singleton dimension in the result tensor is inverse-broadcasting, because there must be at least one non-1 dimension + // in one of the inputs, otherwise the entire dimension would have been optimized away above. 
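+            // (Illustration: for a [2 x 3] input summed into a [2 x 1] output, dimension 1 has extent 3 in the
+            //  input but 1 in the result, so it is inverse-broadcasting: the kernel loops over it and sums.)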
+ vector isReducingDim(dims); // true for each inverse-broadcasting dimension + for (size_t k = 0; k < dims; k++) + isReducingDim[k] = shapes.back()[k] == 1; + + // form the regular (non-inverse-broadcasting) dims + array, N> regularStrides; + for (size_t i = 0; i < N; i++) + regularStrides[i] = shapes[i].DropSingletonDims(isReducingDim).GetStrides(); + auto regularOpDims = TensorShape(opDims).DropSingletonDims(isReducingDim).GetDims(); // (ugh) + + // form the inverse-broadcasting dims + vector isRegularDim(dims); // true for each inverse-broadcasting dimension + for (size_t k = 0; k < dims; k++) + isRegularDim[k] = !isReducingDim[k]; // (no way to do this more nicely?) + array, N> reducingStrides; + for (size_t i = 0; i < N; i++) + reducingStrides[i] = shapes[i].DropSingletonDims(isRegularDim).GetStrides(); + auto reducingOpDims = TensorShape(opDims).DropSingletonDims(isReducingDim).GetDims(); // (ugh) + // now perform the operation - fprintf(stderr, "Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); - // :) - beta; alpha; + array offsets = { a.GetShape().GetOffset(), b.GetShape().GetOffset(), c.GetShape().GetOffset() }; + c.GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } // simple test function for testing stuff diff --git a/Source/Math/TensorView.h b/Source/Math/TensorView.h index be037fa5b..7802f908d 100644 --- a/Source/Math/TensorView.h +++ b/Source/Math/TensorView.h @@ -43,7 +43,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // accessors // ------------------------------------------------------------------- - const Matrix & GetSOB() const { return *m_sob; } + Matrix & GetSOB() const { return *m_sob; } const TensorShape & GetShape() const { return m_shape; } // ------------------------------------------------------------------- From 38cb2fa9ecf788285b9358595b8f5b115fffb6bd Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Thu, 17 Dec 2015 16:14:54 -0800 Subject: [PATCH 06/19] bug fix in MBLayout: We should not guard against all parallel sequences having a gap at a time step, as that happens in truncated BPTT, and it would be much more complex to fix the reader, so we allow it --- Source/Common/Include/Sequences.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/Source/Common/Include/Sequences.h b/Source/Common/Include/Sequences.h index 15484458f..2d5543cc4 100644 --- a/Source/Common/Include/Sequences.h +++ b/Source/Common/Include/Sequences.h @@ -109,14 +109,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_numTimeSteps = numTimeSteps; // allocate lookup tables (note: except at the start, these don't really allocate new memory most of the time) #if 1 - if (m_distanceToStart.GetNumRows() != m_numParallelSequences || m_distanceToStart.GetNumCols() != m_numTimeSteps) // sanity check for debugging a regression + if ((m_distanceToStart.GetNumRows() != m_numParallelSequences || m_distanceToStart.GetNumCols() != m_numTimeSteps) && m_numTimeSteps > 0) // sanity check for debugging a regression fprintf(stderr, "MBLayout::Init: Resizing m_distanceToStart from %d x %d to %d x %d\n", (int)m_distanceToStart.GetNumRows(), (int)m_distanceToStart.GetNumCols(), (int)m_numParallelSequences, (int)m_numTimeSteps); // (I really want to know about actual allocations, but this is a necessary condition for them) #endif 
         m_distanceToStart.Resize(m_numParallelSequences, m_numTimeSteps);
         m_distanceToEnd.Resize(m_numParallelSequences, m_numTimeSteps);
-        m_distanceToNearestStart.assign(m_numTimeSteps, SIZE_MAX);
-        m_distanceToNearestEnd.assign(m_numTimeSteps, SIZE_MAX);
+        m_distanceToNearestStart.assign(m_numTimeSteps, PTRDIFF_MAX);
+        m_distanceToNearestEnd.assign(m_numTimeSteps, PTRDIFF_MAX);
         m_timeStepHasGap.assign(m_numTimeSteps, false);
         m_columnsValidityMask.Resize(0, 0); // invalidate
         // reset state
@@ -190,8 +190,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             LogicError("AddSequence: Sequence added to an MBLayout must overlap with minibatch.");
 
         // remember it
-#if 1
-        auto cap = m_sequences.capacity();  // some sanity check for debugging a speed regression
+#ifdef _DEBUG
+        auto cap = m_sequences.capacity();  // Some sanity check for debugging a speed regression. This should only show up during the first minibatches, and growing only.
         m_sequences.push_back(seqDesc);
         if (cap != m_sequences.capacity())
             fprintf(stderr, "AddSequence: m_sequences was reallocated from capacity %d to %d\n", (int)cap, (int)m_sequences.capacity());
@@ -218,8 +218,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             {
                 // update the nearest sentence boundaries, minimum over all parallel sequences
                 // If 0, then we are on a boundary. If not 0, we can still test in presence of FrameRange.m_timeOffset.
-                size_t distanceToStart = (size_t)((ptrdiff_t)t - beginTime);
-                size_t distanceToEnd = endTime - 1 - t;
+                ptrdiff_t distanceToStart = (ptrdiff_t)t - beginTime;
+                ptrdiff_t distanceToEnd = (ptrdiff_t)(endTime - 1 - t);
                 m_distanceToStart(s, t) = (float)distanceToStart;
                 m_distanceToEnd(s, t) = (float)distanceToEnd;
                 // and the aggregate
@@ -355,7 +355,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         // m_distanceToNearestStart  = [ 0 1 2 3 4 ]
         // m_distanceToNearestEnd    = [ 2 1 0 1 0 ]
         Matrix<float> m_distanceToStart, m_distanceToEnd;                   // (s,t); value<0 stands for gap
-        vector<size_t> m_distanceToNearestStart, m_distanceToNearestEnd;    // [t]    (does not store info about gaps; consult m_timeStepHasGap[] vector instead)
+        vector<ptrdiff_t> m_distanceToNearestStart, m_distanceToNearestEnd; // [t]    (does not store info about gaps; consult m_timeStepHasGap[] vector instead)
 
         vector<bool> m_timeStepHasGap;                                      // [t] true if at least one gap in time step t
 
@@ -551,7 +551,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if (s == SIZE_MAX)  // aggregate requested
             {
                 // determine flags from aggregate vectors
-                assert(m_distanceToNearestStart[t] != SIZE_MAX); // (sanity check)
+                // Note: We allow that all parallel sequences contain gaps (m_distanceToNearestStart[t] == PTRDIFF_MAX)
+                // because that makes implementation of the reader easier for truncated BPTT (it knows too late that there are not that many frames left).
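+                // (If every parallel sequence has a gap at t, m_distanceToNearestStart[t] simply stays PTRDIFF_MAX,
+                //  and the boundary tests below then see "very far from any boundary", which is the intended behavior.)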
auto distanceToStart = (ptrdiff_t)m_distanceToNearestStart[t]; if (distanceToStart < -fr.m_timeOffset) return true; From 928da8828c2bf01f6c6b33f905ca7ff7d6ce8434 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Thu, 17 Dec 2015 17:04:55 -0800 Subject: [PATCH 07/19] first version of CPU implementation of TensorView::DoSumOf() working now --- Source/Common/Include/DataTensor.h | 11 +++--- Source/Math/CPUMatrix.cpp | 2 +- Source/Math/Matrix.h | 2 ++ Source/Math/TensorView.cpp | 57 +++++++++++++++++++++--------- 4 files changed, 48 insertions(+), 24 deletions(-) diff --git a/Source/Common/Include/DataTensor.h b/Source/Common/Include/DataTensor.h index 0152343d0..e661efe79 100644 --- a/Source/Common/Include/DataTensor.h +++ b/Source/Common/Include/DataTensor.h @@ -227,19 +227,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { result.m_strides[k] = /*result.m_dims[k - 1] *, it's 1 */ result.m_strides[k - 1]; return result; } - TensorShape DropSingletonDims(const std::vector & toDrop) const // flatten [k] with [k-1] if toFlatten[k] is set + TensorShape DropDims(const std::vector & toDrop) const // remove dimension { + // this deletes a dimension while retaining strides + // This implies a slice to [0] for this dimension. TensorShape result = *this; size_t j = 0; for (size_t k = 0; k < size(); k++) { if (toDrop[k]) - { - if (result.m_dims[k] != 1) - LogicError("DeropSingletonDims() cannot drop non-singleton dimensions."); - else - continue; - } + continue; else { // example diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index 092cd8c9e..788b6287e 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -5621,7 +5621,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { const std::vector & regularOpDims, const std::array, N> & regularStrides, const std::vector & reducingOpDims, const std::array, N> & reducingStrides) { - size_t dims = regularOpDims.size(); + size_t dims = reducingOpDims.size(); switch (dims) { case 2: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index 0a6c488c4..1f7f1330f 100644 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -17,6 +17,7 @@ #include #include // for shared_ptr #include +#include // This class is exported from the Math.dll namespace Microsoft { namespace MSR { namespace CNTK { @@ -200,6 +201,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void SetValue(const Matrix& deepCopyFrom, const MatrixFormat format=matrixFormatSparseCSR); void SetValue(const size_t numRows, const size_t numCols, int deviceId, ElemType *pArray, const size_t matrixFlags = matrixFlagNormal); void SetValue(const size_t rIdx, const size_t cIdx, ElemType val); // set matrix sparsely + void SetValue(const size_t numRows, const size_t numCols, std::initializer_list l) { std::vector vals(l); assert(vals.size() == numRows * numCols); SetValue(numRows, numCols, GetDeviceId(), vals.data(), matrixFormatRowMajor); } // SetValue(2,3, {1,2,3, 4,5,6}); static ElemType MakeNan(size_t payload); void Invalidate() { SetValue(MakeNan(__LINE__)); } void SetMatrixFromCSCFormat(const CPUSPARSE_INDEX_TYPE *h_CSCCol, const CPUSPARSE_INDEX_TYPE *h_Row, const ElemType *h_Val, diff --git a/Source/Math/TensorView.cpp b/Source/Math/TensorView.cpp index 2a64f3e64..f91e11899 100644 --- a/Source/Math/TensorView.cpp +++ b/Source/Math/TensorView.cpp @@ -129,8 +129,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { 
neither:; } for (size_t i = 0; i < N; i++) - shapes[i] = shapes[i].DropSingletonDims(toDrop); - opDims = TensorShape(opDims).DropSingletonDims(toDrop).GetDims(); // (ugh) + shapes[i] = shapes[i].DropDims(toDrop); + opDims = TensorShape(opDims).DropDims(toDrop).GetDims(); // (ugh) // note: if op is a scalar, then we end up with 0 dimensions here, which is allowed //fprintf(stderr, "Post-drop: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); @@ -173,8 +173,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // form the regular (non-inverse-broadcasting) dims array, N> regularStrides; for (size_t i = 0; i < N; i++) - regularStrides[i] = shapes[i].DropSingletonDims(isReducingDim).GetStrides(); - auto regularOpDims = TensorShape(opDims).DropSingletonDims(isReducingDim).GetDims(); // (ugh) + regularStrides[i] = shapes[i].DropDims(isReducingDim).GetStrides(); + auto regularOpDims = TensorShape(opDims).DropDims(isReducingDim).GetDims(); // (ugh) // form the inverse-broadcasting dims vector isRegularDim(dims); // true for each inverse-broadcasting dimension @@ -182,8 +182,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { isRegularDim[k] = !isReducingDim[k]; // (no way to do this more nicely?) array, N> reducingStrides; for (size_t i = 0; i < N; i++) - reducingStrides[i] = shapes[i].DropSingletonDims(isRegularDim).GetStrides(); - auto reducingOpDims = TensorShape(opDims).DropSingletonDims(isReducingDim).GetDims(); // (ugh) + reducingStrides[i] = shapes[i].DropDims(isRegularDim).GetStrides(); + auto reducingOpDims = TensorShape(opDims).DropDims(isRegularDim).GetDims(); // (ugh) // now perform the operation array offsets = { a.GetShape().GetOffset(), b.GetShape().GetOffset(), c.GetShape().GetOffset() }; @@ -195,16 +195,41 @@ namespace Microsoft { namespace MSR { namespace CNTK { template /*static*/ void TensorView::Test() { - Matrix m1(-1); m1.Resize(1, 42); - Matrix m2(-1); m2.Resize(13, 1); - Matrix m3(-1); m3.Resize(13, 21); - TensorShape s1(1, 2, 21); - TensorShape s2(13, 1); - TensorShape s3(13, 1, 21); - let t1 = TensorView(m1, s1); t1; - let t2 = TensorView(m2, s2); t2; - auto t3 = TensorView(m3, s3); t3; - t3.DoSumOf(0, t1, t2, 1); + Matrix m1(-1); + Matrix m2(-1); + Matrix m3(-1); + { + m1.SetValue(2, 3, { 1, 2, 3, + 4, 5, 6 }); + m2.SetValue(2, 1, { 13, + 42 }); + m3.Resize(2, 3); + TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1); + m3.Print(); + } + { + m3.Resize(2, 1); + TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1); + m3.Print(); + } + { + m3.Resize(1, 3); + TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1); + m3.Print(); + } + { + m1.Resize(1, 42); + m2.Resize(13, 1); + m3.Resize(13, 21); + TensorShape s1(1, 2, 21); + TensorShape s2(13, 1); + TensorShape s3(13, 1, 21); + let t1 = TensorView(m1, s1); t1; + let t2 = TensorView(m2, s2); t2; + auto t3 = TensorView(m3, s3); t3; + t3.DoSumOf(0, t1, t2, 1); + m3.Print(); + } } template class TensorView; From 83e5bbc3f538b7c99998a743273497da319ddd14 Mon Sep 17 00:00:00 2001 From: Qiwei Ye Date: Fri, 18 Dec 2015 12:38:39 +0800 Subject: [PATCH 08/19] Revert "Revert "adding an MPI init test in case of that MPI was initialized repeatedly"" This reverts commit 23ebe452a5e35dddfba2d08e8fb3265901bfc8af. 
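For reference, the guard being reinstated follows the standard MPI "initialize at
most once" pattern sketched below (the EnsureMpiInitialized() helper and the argument
handling are illustrative only; the actual code calls CNTK's MPI_Init_DL() wrapper
rather than MPI_Init() directly):

    #include <mpi.h>

    void EnsureMpiInitialized(int* argc, char*** argv)
    {
        int alreadyInitialized = 0;
        MPI_Initialized(&alreadyInitialized); // legal to call even before MPI_Init()
        if (!alreadyInitialized)
            MPI_Init(argc, argv);             // initializing twice is an MPI error
    }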
--- Source/Common/Include/MPIWrapper.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Source/Common/Include/MPIWrapper.h b/Source/Common/Include/MPIWrapper.h index 781fab023..1ffb16c92 100644 --- a/Source/Common/Include/MPIWrapper.h +++ b/Source/Common/Include/MPIWrapper.h @@ -112,7 +112,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { fprintf(stderr, "MPIWrapper: initializing MPI\n"); fflush(stderr); - MPI_Init_DL() || MpiFail("mpiaggregator: MPI_Init"); + int flag = 0; + MPI_Initialized(&flag); + if (!flag) + { + MPI_Init_DL() || MpiFail("mpiaggregator: MPI_Init"); + } MPI_Comm_rank(MPI_COMM_WORLD, &m_myRank); MPI_Comm_size(MPI_COMM_WORLD, &m_numMPINodes); m_numNodesInUse = m_numMPINodes; From 7d32cdfd1abc9d6186a49c8591f6353140094a43 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Thu, 17 Dec 2015 22:41:19 -0800 Subject: [PATCH 09/19] implemented all binary tensor operators (don't we love macros!) --- Source/Math/CPUMatrix.cpp | 51 +++++++++++++++++++++++--------------- Source/Math/CommonMatrix.h | 11 ++++++++ Source/Math/TensorView.cpp | 25 +++++++++++++------ Source/Math/TensorView.h | 7 +++++- 4 files changed, 66 insertions(+), 28 deletions(-) diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index 788b6287e..c7db6e43a 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -5256,27 +5256,27 @@ namespace Microsoft { namespace MSR { namespace CNTK { return *this; } - #pragma endregion Static BLAS Functions - double logadd(double x, double y) + template + ElemType logadd_(ElemType x, ElemType y) { - double temp, diff, z; - - if (x < y) { - temp = x; x = y; y = temp; - } - diff = y - x; - if (diff < MINLOGEXP) + if (x < y) { - return (x < LSMALL)?LZERO:x; + ElemType temp = x; x = y; y = temp; + } + ElemType diff = y - x; + if (diff < (ElemType)MINLOGEXP) + { + return (x < (ElemType)LSMALL) ? 
(ElemType)LZERO : x;
         }
         else
         {
-            z = exp(diff);
-            return x + log(1.0 + z);
+            ElemType z = exp_(diff);
+            return x + log_((ElemType)1.0 + z);
         }
     }
+    double logadd(double x, double y) { return logadd_(x, y); }
 
     template<class ElemType>
     ElemType CPUMatrix<ElemType>::LogAddSumOfElements() const
@@ -5546,8 +5546,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN & opfn,
             const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, N> & reducingStrides)
         {
-            array<ptrdiff_t, N> strides;
-            for (size_t i = 0; i < N; i++)      // N = a small constant, this will be unrolled
+            array<ptrdiff_t, N - 1> strides;    // N-1 because last one is the result pointer, which is unused in reduction
+            for (size_t i = 0; i < N - 1; i++)  // N = a small constant, this will be unrolled
                 strides[i] = reducingStrides[i][(size_t)m];
             ElemType aggregate = 0;
             for (size_t dim = reducingOpDims[(size_t)m]; dim-- > 0;)
@@ -5555,8 +5555,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 // need to descend into one loop deeper
                 aggregate += TensorOpReduction<ElemType, OPFN, N, m - 1>::Loop(pointers, opfn, reducingOpDims, reducingStrides);
                 // advance the pointers
-                for (size_t i = 0; i < N; i++)
-                    pointers[i] += strides[i];
+                for (size_t i = 0; i < N - 1; i++)
+                    pointers[i] += strides[i];  // note: last pointer (result) is unused and untouched here
             }
             return aggregate;
         }
@@ -5653,7 +5653,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         }
     }
 
+    // define a static function for every operation
+#define DefOp(op, expr) template<class ElemType> static inline ElemType Op ## op(ElemType a, ElemType b) { return expr; }
+
+    DefOp(Sum, a + b); DefOp(Difference, a - b); DefOp(ElementWiseProduct, a*b); DefOp(ElementWiseQuotient, a / b);
+    DefOp(LogSum, logadd_(a, b)); DefOp(Max, a > b ? a : b); DefOp(Min, a < b ? a : b);
+    DefOp(EQ, a == b); DefOp(NE, a != b); DefOp(GT, a > b); DefOp(LT, a < b); DefOp(GE, a >= b); DefOp(LE, a <= b);
+
     // perform binary operation 'op' on a and b giving c, reinterpreting the matrices as tensors as specified by the dims and strides
+    // This maps 'op' to a lambda.
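+    // (Illustration: the CaseBinaryTensorOp(Sum) case below expands to
+    //      case ElementWiseOperator::opSum:
+    //          return TensorOpWithFn(beta, pointers, alpha,
+    //                                [](const array<ElemType*, 3> & pp) { return OpSum(*(pp[0]), *(pp[1])); },
+    //                                offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
+    //  i.e. each op code gets its own inlinable lambda around the scalar OpXxx function.)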
template void CPUMatrix::TensorOp(ElemType beta, const CPUMatrix& a, const CPUMatrix& b, ElemType alpha, ElementWiseOperator op, const std::array & offsets, @@ -5661,12 +5669,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { const std::vector & reducingOpDims, const std::array, 3> & reducingStrides) { array pointers = { a.m_pArray, b.m_pArray, m_pArray }; +#define CaseBinaryTensorOp(oper) \ + case ElementWiseOperator::op ## oper: \ + return TensorOpWithFn(beta, pointers, alpha, [](const array & pp) { return Op ## oper(*(pp[0]), *(pp[1])); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides) switch (op) { - case ElementWiseOperator::opSum: - return TensorOpWithFn(beta, pointers, alpha, [](const array & pp) { return *(pp[0]) + *(pp[1]); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - default: - LogicError("TensorNnaryOp: Unknown op code %d.", (int)op); + CaseBinaryTensorOp(Sum); CaseBinaryTensorOp(Difference); CaseBinaryTensorOp(ElementWiseProduct); CaseBinaryTensorOp(ElementWiseQuotient); + CaseBinaryTensorOp(LogSum); CaseBinaryTensorOp(Max); CaseBinaryTensorOp(Min); + CaseBinaryTensorOp(EQ); CaseBinaryTensorOp(NE); CaseBinaryTensorOp(GT); CaseBinaryTensorOp(LT); CaseBinaryTensorOp(GE); CaseBinaryTensorOp(LE); + default: LogicError("TensorNnaryOp: Unknown op code %d.", (int)op); } } diff --git a/Source/Math/CommonMatrix.h b/Source/Math/CommonMatrix.h index afd5d7d62..1a9762233 100644 --- a/Source/Math/CommonMatrix.h +++ b/Source/Math/CommonMatrix.h @@ -53,12 +53,23 @@ namespace Microsoft { namespace MSR { namespace CNTK { opLogSum, opMax, opMin, opEQ, opNE, opGT, opLT, opGE, opLE, // unary (or binary with constant parameter) + opCopy, opNegate, opNot, opSaturate, opAbs, + opSumAlpha, opSubDifferenceToAlpha, opSubDifferenceFromAlpha, opSigmoid, opSigmoidDerivative, opTanh, opSqrt, opExp, opLog, opLinearRectifierDerivative, opCosine, opNegativeSine // Note: not all of the above are actually implement at present; and not all that's implemented has an opcode. }; + // declare float and double versions of a func under f_ + // e.g. 
exp_ -> exp(double), expf(float) +#define OverloadUnaryMathFns(func) \ + static inline float func ## _(float arg) { return func ## f(arg); } \ + static inline double func ## _(double arg) { return func(arg); } + + OverloadUnaryMathFns(exp); OverloadUnaryMathFns(log); + OverloadUnaryMathFns(tanh); OverloadUnaryMathFns(cos); OverloadUnaryMathFns(sin); + // ----------------------------------------------------------------------- // various enums to describe // ----------------------------------------------------------------------- diff --git a/Source/Math/TensorView.cpp b/Source/Math/TensorView.cpp index f91e11899..e2c3cac27 100644 --- a/Source/Math/TensorView.cpp +++ b/Source/Math/TensorView.cpp @@ -200,22 +200,33 @@ namespace Microsoft { namespace MSR { namespace CNTK { Matrix m3(-1); { m1.SetValue(2, 3, { 1, 2, 3, - 4, 5, 6 }); - m2.SetValue(2, 1, { 13, - 42 }); + 14, 15, 6 }); + m2.SetValue(2, 1, { 42, + 13 }); + + // broadcasting of an input m3.Resize(2, 3); TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1); m3.Print(); - } - { + + TensorView(m3).DoMaxOf(0, TensorView(m1), TensorView(m2), 1); + m3.Print(); + + TensorView(m3).DoGTOf(0, TensorView(m1), TensorView(m2), 1); + m3.Print(); + + // reduction over columns m3.Resize(2, 1); TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1); m3.Print(); - } - { + + // reduction over rows m3.Resize(1, 3); TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1); m3.Print(); + + TensorView(m3).DoLogSumOf(0, TensorView(m1), TensorView(m2), 1); + m3.Print(); } { m1.Resize(1, 42); diff --git a/Source/Math/TensorView.h b/Source/Math/TensorView.h index 7802f908d..d3b3eef02 100644 --- a/Source/Math/TensorView.h +++ b/Source/Math/TensorView.h @@ -56,7 +56,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { // If beta == 0, c is not read out, i.e. it can be uninitialized or contain NaNs. // ------------------------------------------------------------------- - void DoSumOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha) { DoBinaryOpOf(beta, a, b, alpha, ElementWiseOperator::opSum); } +#define DeclareBinaryTensorOp(oper) \ + void Do ## oper ## Of(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha) { DoBinaryOpOf(beta, a, b, alpha, ElementWiseOperator::op ## oper); } + + DeclareBinaryTensorOp(Sum); DeclareBinaryTensorOp(Difference); DeclareBinaryTensorOp(ElementWiseProduct); DeclareBinaryTensorOp(ElementWiseQuotient); + DeclareBinaryTensorOp(LogSum); DeclareBinaryTensorOp(Max); DeclareBinaryTensorOp(Min); + DeclareBinaryTensorOp(EQ); DeclareBinaryTensorOp(NE); DeclareBinaryTensorOp(GT); DeclareBinaryTensorOp(LT); DeclareBinaryTensorOp(GE); DeclareBinaryTensorOp(LE); static void Test(); From f54e1feaaa811159f0aa778e2e8d45810a4cd649 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 18 Dec 2015 00:07:59 -0800 Subject: [PATCH 10/19] implemented unary and ternary tensor ops. 
CPU implementation of elementwise tensor ops is feature complete (but may require optimization) --- Source/Math/CPUMatrix.cpp | 102 ++++++++++++++++++++++++++++++++++--- Source/Math/CPUMatrix.h | 8 +++ Source/Math/CommonMatrix.h | 16 +++--- Source/Math/Matrix.cpp | 30 +++++++++++ Source/Math/Matrix.h | 10 +++- Source/Math/TensorView.cpp | 71 +++++++++++++++++++------- Source/Math/TensorView.h | 16 ++++++ 7 files changed, 220 insertions(+), 33 deletions(-) diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index c7db6e43a..2718ae8e8 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -5582,6 +5582,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { const std::vector & regularOpDims, const std::array, N> & regularStrides, const std::vector & reducingOpDims, const std::array, N> & reducingStrides) { + // TODO: if leading dim is all-ones, we can hard-code the loop and hope the compiler vectorizes for us // non-scalar case: still nested result loops left array strides; for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled @@ -5635,7 +5636,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // This function now expands into different k. template static inline void TensorOpWithFn(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, - const std::array & offsets, + const std::array & offsets, const std::vector & regularOpDims, const std::array, N> & regularStrides, const std::vector & reducingOpDims, const std::array, N> & reducingStrides) { @@ -5653,14 +5654,75 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } + template + static inline ElemType Sigmoid(ElemType z) + { + if (z >= 0) + return 1 / (1 + exp_(-z)); + else + { + ElemType v = exp_(z); + return v / (1 + v); + } + } + template + static inline ElemType SigmoidDerivative(ElemType z) + { + ElemType v = Sigmoid(z); + return v * (1 - v); + } + template + static inline ElemType LinearRectifierDerivative(ElemType z) + { + return z > 0 ? (ElemType)1 : 0; + } + template + static inline ElemType Sqrt(ElemType z) + { + return sqrt_(max(0, z)); + } + // define a static function for every operation -#define DefOp(op, expr) template static inline ElemType Op ## op(ElemType a, ElemType b) { return expr; } +#define DefUnaryOp(op, expr) template static inline ElemType Op ## op(ElemType a) { return expr; } - DefOp(Sum, a + b); DefOp(Difference, a - b); DefOp(ElementWiseProduct, a*b); DefOp(ElementWiseQuotient, a / b); - DefOp(LogSum, logadd_(a, b)); DefOp(Max, a > b ? a : b); DefOp(Min, a < b ? a : b); - DefOp(EQ, a == b); DefOp(NE, a != b); DefOp(GT, a > b); DefOp(LT, a < b); DefOp(GE, a >= b); DefOp(LE, a <= b); + DefUnaryOp(Copy, a); + DefUnaryOp(Negate, -a); DefUnaryOp(Not, !a); + DefUnaryOp(Abs, fabs_(a)); + DefUnaryOp(Sigmoid, Sigmoid(a)); DefUnaryOp(SigmoidDerivative, SigmoidDerivative(a)); DefUnaryOp(Tanh, tanh_(a)); DefUnaryOp(Sqrt, Sqrt(a)); DefUnaryOp(Exp, exp_(a)); DefUnaryOp(Log, log_(a)); DefUnaryOp(LinearRectifierDerivative, LinearRectifierDerivative(a)); DefUnaryOp(Cosine, cos_(a)); DefUnaryOp(NegativeSine, -sin_(a)); + //DefUnaryOp(SaturateBetaAlpha); DefUnaryOp(SumAlpha); DefUnaryOp(SubDifferenceToAlpha); DefUnaryOp(SubDifferenceFromAlpha); - // perform binary operation 'op' on a and b giving c, reinterpreting the matrices as tensors as specified by the dims and strides + // perform unary operation 'op' on a giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides + // This maps 'op' to a lambda. 
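+    // (Illustration: a call like TensorView(m3).DoSqrtOf(0, TensorView(m1), 1), as in the updated Test(),
+    //  arrives here with op == opSqrt and computes m3[i] = sqrt(m1[i]) elementwise, reusing the same
+    //  broadcasting/reduction machinery as the binary case.)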
+ template + void CPUMatrix::TensorOp(ElemType beta, const CPUMatrix& a, ElemType alpha, ElementWiseOperator op, + const std::array & offsets, + const std::vector & regularOpDims, const std::array, 2> & regularStrides, + const std::vector & reducingOpDims, const std::array, 2> & reducingStrides) + { + array pointers = { a.m_pArray, m_pArray }; +#define CaseUnaryTensorOp(oper) \ + case ElementWiseOperator::op ## oper: \ + return TensorOpWithFn(beta, pointers, alpha, [](const array & pp) { return Op ## oper((*(pp[0]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides) + switch (op) + { + CaseUnaryTensorOp(Copy); + CaseUnaryTensorOp(Negate); CaseUnaryTensorOp(Not); + CaseUnaryTensorOp(Abs); + CaseUnaryTensorOp(Sigmoid); CaseUnaryTensorOp(SigmoidDerivative); CaseUnaryTensorOp(Tanh); CaseUnaryTensorOp(Sqrt); CaseUnaryTensorOp(Exp); CaseUnaryTensorOp(Log); CaseUnaryTensorOp(LinearRectifierDerivative); CaseUnaryTensorOp(Cosine); CaseUnaryTensorOp(NegativeSine); + // functions with lambda arguments--these are different + //CaseUnaryTensorOp(SaturateBetaAlpha); CaseUnaryTensorOp(SumAlpha); CaseUnaryTensorOp(SubDifferenceToAlpha); CaseUnaryTensorOp(SubDifferenceFromAlpha); + default: LogicError("TensorUnaryOp: Unknown op code %d.", (int)op); + } + } + + // define a static function for every operation +#define DefBinaryOp(op, expr) template static inline ElemType Op ## op(ElemType a, ElemType b) { return expr; } + + DefBinaryOp(Sum, a + b); DefBinaryOp(Difference, a - b); DefBinaryOp(ElementWiseProduct, a*b); DefBinaryOp(ElementWiseQuotient, a / b); + DefBinaryOp(LogSum, logadd_(a, b)); DefBinaryOp(Max, a > b ? a : b); DefBinaryOp(Min, a < b ? a : b); + DefBinaryOp(EQ, a == b); DefBinaryOp(NE, a != b); DefBinaryOp(GT, a > b); DefBinaryOp(LT, a < b); DefBinaryOp(GE, a >= b); DefBinaryOp(LE, a <= b); + + // perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides // This maps 'op' to a lambda. template void CPUMatrix::TensorOp(ElemType beta, const CPUMatrix& a, const CPUMatrix& b, ElemType alpha, ElementWiseOperator op, @@ -5671,13 +5733,37 @@ namespace Microsoft { namespace MSR { namespace CNTK { array pointers = { a.m_pArray, b.m_pArray, m_pArray }; #define CaseBinaryTensorOp(oper) \ case ElementWiseOperator::op ## oper: \ - return TensorOpWithFn(beta, pointers, alpha, [](const array & pp) { return Op ## oper(*(pp[0]), *(pp[1])); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides) + return TensorOpWithFn(beta, pointers, alpha, [](const array & pp) { return Op ## oper((*(pp[0])), (*(pp[1]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides) switch (op) { CaseBinaryTensorOp(Sum); CaseBinaryTensorOp(Difference); CaseBinaryTensorOp(ElementWiseProduct); CaseBinaryTensorOp(ElementWiseQuotient); CaseBinaryTensorOp(LogSum); CaseBinaryTensorOp(Max); CaseBinaryTensorOp(Min); CaseBinaryTensorOp(EQ); CaseBinaryTensorOp(NE); CaseBinaryTensorOp(GT); CaseBinaryTensorOp(LT); CaseBinaryTensorOp(GE); CaseBinaryTensorOp(LE); - default: LogicError("TensorNnaryOp: Unknown op code %d.", (int)op); + default: LogicError("TensorBinaryOp: Unknown op code %d.", (int)op); + } + } + + // define a static function for every operation +#define DefTernaryOp(op, expr) template static inline ElemType Op ## op(ElemType a, ElemType b, ElemType c) { return expr; } + + DefTernaryOp(Cond, a ? 
b : c); + + // perform ternary operation 'op' on a, and c giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides + // This maps 'op' to a lambda. + template + void CPUMatrix::TensorOp(ElemType beta, const CPUMatrix& a, const CPUMatrix& b, const CPUMatrix& c, ElemType alpha, ElementWiseOperator op, + const std::array & offsets, + const std::vector & regularOpDims, const std::array, 4> & regularStrides, + const std::vector & reducingOpDims, const std::array, 4> & reducingStrides) + { + array pointers = { a.m_pArray, b.m_pArray, c.m_pArray, m_pArray }; +#define CaseTernaryTensorOp(oper) \ + case ElementWiseOperator::op ## oper: \ + return TensorOpWithFn(beta, pointers, alpha, [](const array & pp) { return Op ## oper((*(pp[0])), (*(pp[1])), (*(pp[2]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides) + switch (op) + { + CaseTernaryTensorOp(Cond); + default: LogicError("TensorTernaryOp: Unknown op code %d.", (int)op); } } diff --git a/Source/Math/CPUMatrix.h b/Source/Math/CPUMatrix.h index 6128204c4..18b4dd5fa 100644 --- a/Source/Math/CPUMatrix.h +++ b/Source/Math/CPUMatrix.h @@ -335,10 +335,18 @@ namespace Microsoft { namespace MSR { namespace CNTK { static void TensorShuffleScaleAndAdd(ElemType keepWeight, const CPUMatrix& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const CPUMatrix& b, CPUMatrix& c); + void TensorOp(ElemType beta, const CPUMatrix& a, ElemType alpha, ElementWiseOperator op, + const std::array & offsets, + const std::vector & regularOpDims, const std::array, 2> & regularStrides, + const std::vector & reducingOpDims, const std::array, 2> & reducingStrides); void TensorOp(ElemType beta, const CPUMatrix& a, const CPUMatrix& b, ElemType alpha, ElementWiseOperator op, const std::array & offsets, const std::vector & regularOpDims, const std::array, 3> & regularStrides, const std::vector & reducingOpDims, const std::array, 3> & reducingStrides); + void TensorOp(ElemType beta, const CPUMatrix& a, const CPUMatrix& b, const CPUMatrix& c, ElemType alpha, ElementWiseOperator op, + const std::array & offsets, + const std::vector & regularOpDims, const std::array, 4> & regularStrides, + const std::vector & reducingOpDims, const std::array, 4> & reducingStrides); static CPUMatrix Ones(const size_t rows, const size_t cols); static CPUMatrix Zeros(const size_t rows, const size_t cols); diff --git a/Source/Math/CommonMatrix.h b/Source/Math/CommonMatrix.h index 1a9762233..8bae0cfd5 100644 --- a/Source/Math/CommonMatrix.h +++ b/Source/Math/CommonMatrix.h @@ -48,16 +48,19 @@ namespace Microsoft { namespace MSR { namespace CNTK { enum ElementWiseOperator { + // unary (or binary with constant parameter) + opCopy, + opNegate, opNot, + opAbs, + opSigmoid, opSigmoidDerivative, opTanh, opSqrt, opExp, opLog, opLinearRectifierDerivative, opCosine, opNegativeSine, + // these are not implemented yet: + opSaturateBetaAlpha, opSumAlpha, opSubDifferenceToAlpha, opSubDifferenceFromAlpha, // binary opSum, opDifference, opElementWiseProduct, opElementWiseQuotient, opLogSum, opMax, opMin, opEQ, opNE, opGT, opLT, opGE, opLE, - // unary (or binary with constant parameter) - opCopy, - opNegate, opNot, - opSaturate, opAbs, - opSumAlpha, opSubDifferenceToAlpha, opSubDifferenceFromAlpha, - opSigmoid, opSigmoidDerivative, opTanh, opSqrt, opExp, opLog, opLinearRectifierDerivative, opCosine, opNegativeSine + // ternary + opCond // Note: not all of the above are actually implement at present; and not all that's implemented has an 
opcode. }; @@ -67,6 +70,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { static inline float func ## _(float arg) { return func ## f(arg); } \ static inline double func ## _(double arg) { return func(arg); } + OverloadUnaryMathFns(fabs); OverloadUnaryMathFns(sqrt); OverloadUnaryMathFns(exp); OverloadUnaryMathFns(log); OverloadUnaryMathFns(tanh); OverloadUnaryMathFns(cos); OverloadUnaryMathFns(sin); diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index d49caee4e..f265413cd 100644 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -5179,6 +5179,21 @@ namespace Microsoft { namespace MSR { namespace CNTK { } #pragma endregion Static BLAS Functions + template + void Matrix::TensorOp(ElemType beta, const Matrix& a, ElemType alpha, ElementWiseOperator op, + const array & offsets, + const vector & regularOpDims, const array, 2> & regularStrides, + const vector & reducingOpDims, const array, 2> & reducingStrides) + { + DISPATCH_MATRIX_ON_FLAG(this, + this, + m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides), + NOT_IMPLEMENTED, //m_GPUMatrix->TensorOp(beta, offsets, *a.m_GPUMatrix, *b.m_GPUMatrix, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED + ); + } + template void Matrix::TensorOp(ElemType beta, const Matrix& a, const Matrix& b, ElemType alpha, ElementWiseOperator op, const array & offsets, @@ -5194,6 +5209,21 @@ namespace Microsoft { namespace MSR { namespace CNTK { ); } + template + void Matrix::TensorOp(ElemType beta, const Matrix& a, const Matrix& b, const Matrix& c, ElemType alpha, ElementWiseOperator op, + const array & offsets, + const vector & regularOpDims, const array, 4> & regularStrides, + const vector & reducingOpDims, const array, 4> & reducingStrides) + { + DISPATCH_MATRIX_ON_FLAG(this, + this, + m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides), + NOT_IMPLEMENTED, //m_GPUMatrix->TensorOp(beta, offsets, *a.m_GPUMatrix, *b.m_GPUMatrix, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED + ); + } + template class Matrix; template class Matrix; diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index 1f7f1330f..b1a2aa9fa 100644 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -378,7 +378,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void VectorMax(Matrix& maxIndexes, Matrix& maxValues, const bool isColWise, int topK) const; void VectorMin(Matrix& minIndexes, Matrix& minValues, const bool isColWise) const; - Matrix& AssignNumOfDiff(const Matrix& a, const Matrix& b, bool searchInCol = false); + Matrix& AssignNumOfDiff(const Matrix& a, const Matrix& b, bool searchInCol = false); Matrix& AssignInnerProductOfMatrices(const Matrix& a, const Matrix& b); //this method will resize(1,1) first @@ -461,10 +461,18 @@ namespace Microsoft { namespace MSR { namespace CNTK { static void TensorShuffleScaleAndAdd(ElemType keepWeight, const Matrix& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const Matrix& b, Matrix& c); + void TensorOp(ElemType beta, const Matrix& a, ElemType alpha, ElementWiseOperator op, + const std::array & offsets, + const std::vector & regularOpDims, const std::array, 2> & regularStrides, + const std::vector & reducingOpDims, const std::array, 2> & reducingStrides); void 
TensorOp(ElemType beta, const Matrix& a, const Matrix& b, ElemType alpha, ElementWiseOperator op, const std::array & offsets, const std::vector & regularOpDims, const std::array, 3> & regularStrides, const std::vector & reducingOpDims, const std::array, 3> & reducingStrides); + void TensorOp(ElemType beta, const Matrix& a, const Matrix& b, const Matrix& c, ElemType alpha, ElementWiseOperator op, + const std::array & offsets, + const std::vector & regularOpDims, const std::array, 4> & regularStrides, + const std::vector & reducingOpDims, const std::array, 4> & reducingStrides); public: void Read(File& stream); void Write(File& stream) const; diff --git a/Source/Math/TensorView.cpp b/Source/Math/TensorView.cpp index e2c3cac27..0676b014d 100644 --- a/Source/Math/TensorView.cpp +++ b/Source/Math/TensorView.cpp @@ -57,17 +57,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { static bool Matches(size_t d1, size_t d2) { return d1 == 1 || d2 == 1 || d1 == d2; } // do two dimensions match? - template - void TensorView::DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op) + template + static void PrepareTensorOperands(array shapes, array & offsets, + vector & regularOpDims, + array, N> & regularStrides, + vector & reducingOpDims, + array, N> & reducingStrides) { -#define N 3 // later make this a template parameter. N=1 is possible for generators, such as constants. - array shapes; - TensorView & c = *this; - - shapes[0] = a.GetShape(); - shapes[1] = b.GetShape(); - shapes[2] = c.GetShape(); // last one is the output - // massage TensorShapes // Note that TensorShapes here may be shapes are stored or shapes with stride magic applied. @@ -131,6 +127,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t i = 0; i < N; i++) shapes[i] = shapes[i].DropDims(toDrop); opDims = TensorShape(opDims).DropDims(toDrop).GetDims(); // (ugh) + dims = opDims.size(); // #dims has changed + for (size_t i = 0; i < N; i++) + assert(dims == shapes[i].size()); // note: if op is a scalar, then we end up with 0 dimensions here, which is allowed //fprintf(stderr, "Post-drop: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); @@ -141,7 +140,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t i = 0; i < N; i++) shapes[i] = shapes[i].WithBroadcastStrides(); - fprintf(stderr, "Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); + //fprintf(stderr, "%s op %s -> %s via %s\n", string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); // determine inverse broadcasting dimensions // Inverse broadcasting dims are actual for loops in the kernel, whereas broadcasting input dims are handled by the thread index. 
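(For intuition, here is a minimal standalone sketch, written by analogy and not taken
from the CNTK sources, of what the regular/reducing split computes for the case of one
regular and one reducing dimension; StridedSumSketch and all parameter names are
illustrative:)

    #include <cstddef>

    // c[i] = beta * c[i] + alpha * sum_j a[i * regStrideA + j * redStrideA]
    void StridedSumSketch(float beta, const float* a, float alpha, float* c,
                          size_t regDim, ptrdiff_t regStrideA, ptrdiff_t regStrideC,
                          size_t redDim, ptrdiff_t redStrideA)
    {
        for (size_t i = 0; i < regDim; i++)     // regular dim: one iteration per output element
        {
            float aggregate = 0;
            for (size_t j = 0; j < redDim; j++) // reducing dim: accumulated into a single output element
                aggregate += a[(ptrdiff_t)i * regStrideA + (ptrdiff_t)j * redStrideA];
            float& out = c[(ptrdiff_t)i * regStrideC];
            out = (beta == 0) ? alpha * aggregate // beta == 0: c may be uninitialized, so don't read it
                              : beta * out + alpha * aggregate;
        }
    }

The templated TensorOpWithRegularLoop/TensorOpReduction machinery in CPUMatrix.cpp
generalizes this to up to 4 regular and 2 reducing dimensions, with the loop nest
expanded at compile time.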
@@ -171,23 +170,55 @@ namespace Microsoft { namespace MSR { namespace CNTK { isReducingDim[k] = shapes.back()[k] == 1; // form the regular (non-inverse-broadcasting) dims - array, N> regularStrides; for (size_t i = 0; i < N; i++) regularStrides[i] = shapes[i].DropDims(isReducingDim).GetStrides(); - auto regularOpDims = TensorShape(opDims).DropDims(isReducingDim).GetDims(); // (ugh) + regularOpDims = TensorShape(opDims).DropDims(isReducingDim).GetDims(); // (ugh) // form the inverse-broadcasting dims vector isRegularDim(dims); // true for each inverse-broadcasting dimension for (size_t k = 0; k < dims; k++) isRegularDim[k] = !isReducingDim[k]; // (no way to do this more nicely?) - array, N> reducingStrides; for (size_t i = 0; i < N; i++) reducingStrides[i] = shapes[i].DropDims(isRegularDim).GetStrides(); - auto reducingOpDims = TensorShape(opDims).DropDims(isRegularDim).GetDims(); // (ugh) + reducingOpDims = TensorShape(opDims).DropDims(isRegularDim).GetDims(); // (ugh) + + for (size_t i = 0; i < N; i++) + offsets[i] = shapes[i].GetOffset(); + } + + template + void TensorView::DoUnaryOpOf(ElemType beta, const TensorView & a, ElemType alpha, ElementWiseOperator op) + { + // prepare all tensor descriptor information as needed for execution + array offsets; + array, 2> regularStrides, reducingStrides; + vector regularOpDims, reducingOpDims; + PrepareTensorOperands(array { a.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); // now perform the operation - array offsets = { a.GetShape().GetOffset(), b.GetShape().GetOffset(), c.GetShape().GetOffset() }; - c.GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + GetSOB().TensorOp(beta, a.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + } + + template + void TensorView::DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op) + { + array offsets; + array, 3> regularStrides, reducingStrides; + vector regularOpDims, reducingOpDims; + PrepareTensorOperands(array { a.GetShape(), b.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + + GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + } + + template + void TensorView::DoTernaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha, ElementWiseOperator op) + { + array offsets; + array, 4> regularStrides, reducingStrides; + vector regularOpDims, reducingOpDims; + PrepareTensorOperands(array { a.GetShape(), b.GetShape(), c.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + + GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), c.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } // simple test function for testing stuff @@ -204,8 +235,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { m2.SetValue(2, 1, { 42, 13 }); - // broadcasting of an input + // unary ops m3.Resize(2, 3); + TensorView(m3).DoSqrtOf(0, TensorView(m1), 1); + m3.Print(); + + // broadcasting of an input TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1); m3.Print(); diff --git a/Source/Math/TensorView.h b/Source/Math/TensorView.h index d3b3eef02..1ceb0332a 100644 --- a/Source/Math/TensorView.h +++ b/Source/Math/TensorView.h @@ 
-56,6 +56,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { // If beta == 0, c is not read out, i.e. it can be uninitialized or contain NaNs. // ------------------------------------------------------------------- +#define DeclareUnaryTensorOp(oper) \ + void Do ## oper ## Of(ElemType beta, const TensorView & a, ElemType alpha) { DoUnaryOpOf(beta, a, alpha, ElementWiseOperator::op ## oper); } + + DeclareUnaryTensorOp(Copy); + DeclareUnaryTensorOp(Negate); DeclareUnaryTensorOp(Not); + DeclareUnaryTensorOp(Abs); + DeclareUnaryTensorOp(Sigmoid); DeclareUnaryTensorOp(SigmoidDerivative); DeclareUnaryTensorOp(Tanh); DeclareUnaryTensorOp(Sqrt); DeclareUnaryTensorOp(Exp); DeclareUnaryTensorOp(Log); DeclareUnaryTensorOp(LinearRectifierDerivative); DeclareUnaryTensorOp(Cosine); DeclareUnaryTensorOp(NegativeSine); + DeclareUnaryTensorOp(SaturateBetaAlpha); DeclareUnaryTensorOp(SumAlpha); DeclareUnaryTensorOp(SubDifferenceToAlpha); DeclareUnaryTensorOp(SubDifferenceFromAlpha); + #define DeclareBinaryTensorOp(oper) \ void Do ## oper ## Of(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha) { DoBinaryOpOf(beta, a, b, alpha, ElementWiseOperator::op ## oper); } @@ -63,11 +72,18 @@ namespace Microsoft { namespace MSR { namespace CNTK { DeclareBinaryTensorOp(LogSum); DeclareBinaryTensorOp(Max); DeclareBinaryTensorOp(Min); DeclareBinaryTensorOp(EQ); DeclareBinaryTensorOp(NE); DeclareBinaryTensorOp(GT); DeclareBinaryTensorOp(LT); DeclareBinaryTensorOp(GE); DeclareBinaryTensorOp(LE); +#define DeclareTernaryTensorOp(oper) \ + void Do ## oper ## Of(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha) { DoTernaryOpOf(beta, a, b, c, alpha, ElementWiseOperator::op ## oper); } + + DeclareTernaryTensorOp(Cond); + static void Test(); private: + void DoUnaryOpOf(ElemType beta, const TensorView & a, ElemType alpha, ElementWiseOperator op); void DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op); + void DoTernaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha, ElementWiseOperator op); // ------------------------------------------------------------------- // sob members From 679c3c52984b1c5854d8003e2950501eeaaaee4a Mon Sep 17 00:00:00 2001 From: Mark Hillebrand Date: Tue, 15 Dec 2015 12:39:43 +0000 Subject: [PATCH 11/19] Source/Readers/LMSequenceReader/: also build SequenceWriter on Linux --- Makefile | 1 + .../LMSequenceReader/SequenceWriter.cpp | 4 +-- .../Readers/LMSequenceReader/SequenceWriter.h | 36 ++++++++++--------- 3 files changed, 23 insertions(+), 18 deletions(-) diff --git a/Makefile b/Makefile index 8c01ff667..fbff1ad0e 100644 --- a/Makefile +++ b/Makefile @@ -309,6 +309,7 @@ LMSEQUENCEREADER_SRC =\ $(SOURCEDIR)/Readers/LMSequenceReader/Exports.cpp \ $(SOURCEDIR)/Readers/LMSequenceReader/SequenceParser.cpp \ $(SOURCEDIR)/Readers/LMSequenceReader/SequenceReader.cpp \ + $(SOURCEDIR)/Readers/LMSequenceReader/SequenceWriter.cpp \ LMSEQUENCEREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(LMSEQUENCEREADER_SRC)) diff --git a/Source/Readers/LMSequenceReader/SequenceWriter.cpp b/Source/Readers/LMSequenceReader/SequenceWriter.cpp index 915052c72..5a74afd98 100644 --- a/Source/Readers/LMSequenceReader/SequenceWriter.cpp +++ b/Source/Readers/LMSequenceReader/SequenceWriter.cpp @@ -4,10 +4,10 @@ // // -// - #include "stdafx.h" +#ifdef _WIN32 #include +#endif #include "Basics.h" #include #include diff --git 
a/Source/Readers/LMSequenceReader/SequenceWriter.h b/Source/Readers/LMSequenceReader/SequenceWriter.h index 99eec4da3..06ab0fd3f 100644 --- a/Source/Readers/LMSequenceReader/SequenceWriter.h +++ b/Source/Readers/LMSequenceReader/SequenceWriter.h @@ -12,21 +12,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { - template - void DATAWRITER_API GetWriter(IDataWriter** pwriter) - { - *pwriter = new LMSequenceWriter(); - } - - extern "C" DATAWRITER_API void GetWriterF(IDataWriter** pwriter) - { - GetWriter(pwriter); - } - extern "C" DATAWRITER_API void GetWriterD(IDataWriter** pwriter) - { - GetWriter(pwriter); - } - template class LMSequenceWriter : public IDataWriter { @@ -65,8 +50,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { } public: + using LabelType = typename IDataWriter::LabelType; + using LabelIdType = typename IDataWriter::LabelIdType; void GetSections(std::map& /*sections*/){} - void SaveMapping(std::wstring saveId, const std::map& /*labelMapping*/){} + void SaveMapping(std::wstring saveId, const std::map& /*labelMapping*/){} public: template @@ -77,4 +64,21 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual bool SaveData(size_t recordStart, const std::map& matrices, size_t numRecords, size_t datasetSize, size_t byteVariableSized); }; + template + void DATAWRITER_API GetWriter(IDataWriter** pwriter) + { + assert(pwriter != nullptr); + *pwriter = new LMSequenceWriter(); + assert(*pwriter != nullptr); + } + + extern "C" DATAWRITER_API void GetWriterF(IDataWriter** pwriter) + { + GetWriter(pwriter); + } + extern "C" DATAWRITER_API void GetWriterD(IDataWriter** pwriter) + { + GetWriter(pwriter); + } + }}} From 91eadb058777589aba4a2ed11bc3530068a272c1 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 18 Dec 2015 08:54:19 -0800 Subject: [PATCH 12/19] moved all tensor ops to a new header TensorOps.h so they can be shared between matrix types; also moved the float/double-unified math overloads (e.g. 
exp_()) there, as well as additional typically needed functions such as Sigmoid()
---
 .../lyx/CNTKBook_CN_Chapter.lyx | 2 +-
 Source/Math/CPUMatrix.cpp | 128 +++++------------
 Source/Math/CommonMatrix.h | 24 ++--
 Source/Math/Math.vcxproj | 1 +
 Source/Math/Math.vcxproj.filters | 3 +
 Source/Math/Matrix.cpp | 2 +
 Source/Math/TensorOps.h | 132 ++++++++++++++++++
 Source/Math/TensorView.h | 29 ++--
 8 files changed, 206 insertions(+), 115 deletions(-)
 create mode 100644 Source/Math/TensorOps.h

diff --git a/Documentation/CNTK-TechReport/lyx/CNTKBook_CN_Chapter.lyx b/Documentation/CNTK-TechReport/lyx/CNTKBook_CN_Chapter.lyx
index 2563ad515..8e9a5c845 100644
--- a/Documentation/CNTK-TechReport/lyx/CNTKBook_CN_Chapter.lyx
+++ b/Documentation/CNTK-TechReport/lyx/CNTKBook_CN_Chapter.lyx
@@ -9154,7 +9154,7 @@ L
 \begin_layout Standard
 \begin_inset Formula
 \begin{eqnarray}
-\alpha_{t}\left(i\right) & \leftarrow & h_{it}+logadd_{k}\left(\delta_{t-1}(k)+\eta a_{ki}\right)\\
+\alpha_{t}\left(i\right) & \leftarrow & h_{it}+LogAdd_{k}\left(\delta_{t-1}(k)+\eta a_{ki}\right)\\
 \mathbf{\frac{\partial R}{\partial\delta_{t-1}(i)}} & \leftarrow & \sum_{j}\frac{\partial C_{logadd}}{\partial\delta_{t}(j)}\frac{\exp(\delta_{t-1}(i)+a_{i,j})}{\sum_{k}\exp(\delta_{t-1}(k)+a_{k,j})}\\
 \mathbf{\frac{\partial R}{\partial\delta_{T}(i)}} & \leftarrow & \frac{\exp(\delta_{T}(i))}{\sum_{k}\exp(\delta_{T}(k))}\\
 \frac{\partial R}{\partial h_{t}(i)} & \leftarrow & l_{t}(i)-\frac{\partial R}{\partial\delta_{t}(i)}\\
diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp
index 2718ae8e8..ca08faf71 100644
--- a/Source/Math/CPUMatrix.cpp
+++ b/Source/Math/CPUMatrix.cpp
@@ -9,12 +9,13 @@
 #include "stdafx.h"
 #include "Basics.h"
 #include "File.h"
-
+#include "CPUMatrix.h"
+#include "TensorOps.h"
 #include
 #include
 #include
 #include
-#include "CPUMatrix.h"
+
 #include
 #include
 #include
@@ -4304,7 +4305,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (sample_id == 0)
 sample_prob = -sample_prob;
 double score_noise = log_num_noise_samples + sample_prob;
- double z = logadd(score, score_noise);
+ double z = LogAdd(score, score_noise);
 double logprob = score - z;
 double logprob_noise = score_noise - z;
 tmp(sample_id, instance_id) = (ElemType)-std::exp(logprob);
@@ -5258,32 +5259,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 #pragma endregion Static BLAS Functions

- template<class ElemType>
- ElemType logadd_(ElemType x, ElemType y)
- {
- if (x < y)
- {
- ElemType temp = x; x = y; y = temp;
- }
- ElemType diff = y - x;
- if (diff < (ElemType)MINLOGEXP)
- {
- return (x < (ElemType)LSMALL) ? 
(ElemType)LZERO : x; - } - else - { - ElemType z = exp_(diff); - return x + log_((ElemType)1.0 + z); - } - } - double logadd(double x, double y) { return logadd_(x, y); } + // 'double' version of LogAdd + double LogAddD(double x, double y) { return LogAdd(x, y); } template ElemType CPUMatrix::LogAddSumOfElements() const { ElemType fAlpha = (ElemType)LZERO; for (int k = 0; k < GetNumElements(); k++) - fAlpha = (ElemType) logadd(fAlpha, m_pArray[k]); + fAlpha = (ElemType) LogAddD(fAlpha, m_pArray[k]); return fAlpha; } @@ -5330,7 +5314,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { fSum = (ElemType)LZERO; for (int j = 0; j < iNumLab; j++) { - fSum = (ElemType)logadd((double)fSum, alpha(j, t)); + fSum = (ElemType)LogAddD(fSum, alpha(j, t)); } fTmp = alpha(k, t) - fSum; @@ -5343,10 +5327,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { fSum = (ElemType)LZERO; for (int m = 0; m < iNumLab; m++) { - fSum = (ElemType)logadd((double)fSum, alpha(m, t) + pair_scores(j, m)); + fSum = (ElemType)LogAddD(fSum, alpha(m, t) + pair_scores(j, m)); } - fTmp = (ElemType)logadd(fTmp, beta(j, t + 1) + alpha(k, t) + pair_scores(j, k) - fSum); + fTmp = (ElemType)LogAddD(fTmp, beta(j, t + 1) + alpha(k, t) + pair_scores(j, k) - fSum); } beta(k, t) = fTmp; } @@ -5455,7 +5439,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { else{ fTmp2 = a(k, 0); } - fSum = (ElemType)logadd(fSum, fTmp2 + pair_scores(j, k)); + fSum = (ElemType)LogAddD(fSum, fTmp2 + pair_scores(j, k)); } fTmp -= fSum; @@ -5537,6 +5521,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // TensorView support // ----------------------------------------------------------------------- + // To save time, this makes extensive use of templates and macros. + // perform loop over reduction index m // This function is declared inside a wrapper struct to allow partial specialization (m = -1). template @@ -5654,43 +5640,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } - template - static inline ElemType Sigmoid(ElemType z) - { - if (z >= 0) - return 1 / (1 + exp_(-z)); - else - { - ElemType v = exp_(z); - return v / (1 + v); - } - } - template - static inline ElemType SigmoidDerivative(ElemType z) - { - ElemType v = Sigmoid(z); - return v * (1 - v); - } - template - static inline ElemType LinearRectifierDerivative(ElemType z) - { - return z > 0 ? (ElemType)1 : 0; - } - template - static inline ElemType Sqrt(ElemType z) - { - return sqrt_(max(0, z)); - } - - // define a static function for every operation -#define DefUnaryOp(op, expr) template static inline ElemType Op ## op(ElemType a) { return expr; } - - DefUnaryOp(Copy, a); - DefUnaryOp(Negate, -a); DefUnaryOp(Not, !a); - DefUnaryOp(Abs, fabs_(a)); - DefUnaryOp(Sigmoid, Sigmoid(a)); DefUnaryOp(SigmoidDerivative, SigmoidDerivative(a)); DefUnaryOp(Tanh, tanh_(a)); DefUnaryOp(Sqrt, Sqrt(a)); DefUnaryOp(Exp, exp_(a)); DefUnaryOp(Log, log_(a)); DefUnaryOp(LinearRectifierDerivative, LinearRectifierDerivative(a)); DefUnaryOp(Cosine, cos_(a)); DefUnaryOp(NegativeSine, -sin_(a)); - //DefUnaryOp(SaturateBetaAlpha); DefUnaryOp(SumAlpha); DefUnaryOp(SubDifferenceToAlpha); DefUnaryOp(SubDifferenceFromAlpha); - // perform unary operation 'op' on a giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides // This maps 'op' to a lambda. 
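 // For illustration only (an expansion sketch, not part of this patch): given the
 // CaseUnaryTensorOp macro assumed below, CaseUnaryTensorOp(Sqrt) expands to roughly
 //   case ElementWiseOperator::opSqrt:
 //       return TensorOpWithFn(beta, pointers, alpha,
 //                             [](const array<ElemType*, 2> & pp) { return OpSqrt(*(pp[0])); },
 //                             offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
 // i.e. each opcode binds its scalar Op function into a lambda once, while the
 // strided/reducing traversal inside TensorOpWithFn is shared by all ops.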
template @@ -5699,29 +5648,18 @@ namespace Microsoft { namespace MSR { namespace CNTK { const std::vector & regularOpDims, const std::array, 2> & regularStrides, const std::vector & reducingOpDims, const std::array, 2> & reducingStrides) { + #define CaseUnaryTensorOp(oper) \ + case ElementWiseOperator::op ## oper: \ + return TensorOpWithFn(beta, pointers, alpha, [](const array & pp) { return Op ## oper((*(pp[0]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides) + array pointers = { a.m_pArray, m_pArray }; -#define CaseUnaryTensorOp(oper) \ - case ElementWiseOperator::op ## oper: \ - return TensorOpWithFn(beta, pointers, alpha, [](const array & pp) { return Op ## oper((*(pp[0]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides) switch (op) { - CaseUnaryTensorOp(Copy); - CaseUnaryTensorOp(Negate); CaseUnaryTensorOp(Not); - CaseUnaryTensorOp(Abs); - CaseUnaryTensorOp(Sigmoid); CaseUnaryTensorOp(SigmoidDerivative); CaseUnaryTensorOp(Tanh); CaseUnaryTensorOp(Sqrt); CaseUnaryTensorOp(Exp); CaseUnaryTensorOp(Log); CaseUnaryTensorOp(LinearRectifierDerivative); CaseUnaryTensorOp(Cosine); CaseUnaryTensorOp(NegativeSine); - // functions with lambda arguments--these are different - //CaseUnaryTensorOp(SaturateBetaAlpha); CaseUnaryTensorOp(SumAlpha); CaseUnaryTensorOp(SubDifferenceToAlpha); CaseUnaryTensorOp(SubDifferenceFromAlpha); + ForAllUnaryOps(CaseUnaryTensorOp); default: LogicError("TensorUnaryOp: Unknown op code %d.", (int)op); } } - // define a static function for every operation -#define DefBinaryOp(op, expr) template static inline ElemType Op ## op(ElemType a, ElemType b) { return expr; } - - DefBinaryOp(Sum, a + b); DefBinaryOp(Difference, a - b); DefBinaryOp(ElementWiseProduct, a*b); DefBinaryOp(ElementWiseQuotient, a / b); - DefBinaryOp(LogSum, logadd_(a, b)); DefBinaryOp(Max, a > b ? a : b); DefBinaryOp(Min, a < b ? a : b); - DefBinaryOp(EQ, a == b); DefBinaryOp(NE, a != b); DefBinaryOp(GT, a > b); DefBinaryOp(LT, a < b); DefBinaryOp(GE, a >= b); DefBinaryOp(LE, a <= b); - // perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides // This maps 'op' to a lambda. 
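 // For illustration only (a usage sketch, not part of this patch; m1/m2/m3 are the
 // matrices from TensorView::Test(), with m1 assumed to be 2 x 3 and m2 is 2 x 1):
 //   TensorView<ElemType>(m3).DoSumOf(0, TensorView<ElemType>(m1), TensorView<ElemType>(m2), 1);
 // dispatches to opSum below and, since beta == 0 and alpha == 1, computes
 //   m3(i,j) = m1(i,j) + m2(i,0)
 // i.e. the size-1 column dimension of m2 is broadcast across the columns of the output.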
template<class ElemType>
@@ -5730,24 +5668,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 3> & regularStrides,
 const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 3> & reducingStrides)
 {
+ #define CaseBinaryTensorOp(oper) \
+ case ElementWiseOperator::op ## oper: \
+ return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 3> & pp) { return Op ## oper((*(pp[0])), (*(pp[1]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
+
 array<ElemType*, 3> pointers = { a.m_pArray, b.m_pArray, m_pArray };
-#define CaseBinaryTensorOp(oper) \
- case ElementWiseOperator::op ## oper: \
- return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 3> & pp) { return Op ## oper((*(pp[0])), (*(pp[1]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
 switch (op)
 {
- CaseBinaryTensorOp(Sum); CaseBinaryTensorOp(Difference); CaseBinaryTensorOp(ElementWiseProduct); CaseBinaryTensorOp(ElementWiseQuotient);
- CaseBinaryTensorOp(LogSum); CaseBinaryTensorOp(Max); CaseBinaryTensorOp(Min);
- CaseBinaryTensorOp(EQ); CaseBinaryTensorOp(NE); CaseBinaryTensorOp(GT); CaseBinaryTensorOp(LT); CaseBinaryTensorOp(GE); CaseBinaryTensorOp(LE);
+ ForAllBinaryOps(CaseBinaryTensorOp);
 default: LogicError("TensorBinaryOp: Unknown op code %d.", (int)op);
 }
 }

- // define a static function for every operation
-#define DefTernaryOp(op, expr) template<class ElemType> static inline ElemType Op ## op(ElemType a, ElemType b, ElemType c) { return expr; }
-
- DefTernaryOp(Cond, a ? b : c);
-
 // perform ternary operation 'op' on a, b, and c giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
 // This maps 'op' to a lambda.
 template<class ElemType>
@@ -5756,18 +5688,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 4> & regularStrides,
 const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 4> & reducingStrides)
 {
+ #define CaseTernaryTensorOp(oper) \
+ case ElementWiseOperator::op ## oper: \
+ return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 4> & pp) { return Op ## oper((*(pp[0])), (*(pp[1])), (*(pp[2]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
+
 array<ElemType*, 4> pointers = { a.m_pArray, b.m_pArray, c.m_pArray, m_pArray };
-#define CaseTernaryTensorOp(oper) \
- case ElementWiseOperator::op ## oper: \
- return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 4> & pp) { return Op ## oper((*(pp[0])), (*(pp[1])), (*(pp[2]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
 switch (op)
 {
- CaseTernaryTensorOp(Cond);
+ ForAllTernaryOps(CaseTernaryTensorOp);
 default: LogicError("TensorTernaryOp: Unknown op code %d.", (int)op);
 }
 }

- // The explicit instantiation part
+ // -----------------------------------------------------------------------
+ // explicit instantiations
+ // -----------------------------------------------------------------------
+
 template class MATH_API CPUMatrix<float>;
 template class MATH_API CPUMatrix<double>;

diff --git a/Source/Math/CommonMatrix.h b/Source/Math/CommonMatrix.h
index 8bae0cfd5..8a73246c9 100644
--- a/Source/Math/CommonMatrix.h
+++ b/Source/Math/CommonMatrix.h
@@ -64,15 +64,23 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 // Note: not all of the above are actually implemented at present; and not all that's implemented has an opcode.
 };

- // declare float and double versions of a func under f_
- // e.g. 
exp_ -> exp(double), expf(float) -#define OverloadUnaryMathFns(func) \ - static inline float func ## _(float arg) { return func ## f(arg); } \ - static inline double func ## _(double arg) { return func(arg); } + // helper to apply a C macro for all operations of each kind +#define ForAllUnaryOps(Macro) \ + Macro(Copy); \ + Macro(Negate); Macro(Not); \ + Macro(Abs); \ + Macro(Sigmoid); Macro(SigmoidDerivative); Macro(Tanh); Macro(Sqrt); Macro(Exp); Macro(Log); Macro(LinearRectifierDerivative); Macro(Cosine); Macro(NegativeSine); - OverloadUnaryMathFns(fabs); OverloadUnaryMathFns(sqrt); - OverloadUnaryMathFns(exp); OverloadUnaryMathFns(log); - OverloadUnaryMathFns(tanh); OverloadUnaryMathFns(cos); OverloadUnaryMathFns(sin); +#define ForAllParameterizedUnaryOps(Macro) \ + Macro(SaturateBetaAlpha); Macro(SumAlpha); Macro(SubDifferenceToAlpha); Macro(SubDifferenceFromAlpha); + +#define ForAllBinaryOps(Macro) \ + Macro(Sum); Macro(Difference); Macro(ElementWiseProduct); Macro(ElementWiseQuotient); \ + Macro(LogSum); Macro(Max); Macro(Min); \ + Macro(EQ); Macro(NE); Macro(GT); Macro(LT); Macro(GE); Macro(LE); + +#define ForAllTernaryOps(Macro) \ + Macro(Cond); // ----------------------------------------------------------------------- // various enums to describe diff --git a/Source/Math/Math.vcxproj b/Source/Math/Math.vcxproj index 738b59eed..42bd05ebd 100644 --- a/Source/Math/Math.vcxproj +++ b/Source/Math/Math.vcxproj @@ -162,6 +162,7 @@ + diff --git a/Source/Math/Math.vcxproj.filters b/Source/Math/Math.vcxproj.filters index 625886e2e..a46a3807a 100644 --- a/Source/Math/Math.vcxproj.filters +++ b/Source/Math/Math.vcxproj.filters @@ -70,6 +70,9 @@ Tensors + + Tensors + diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index f265413cd..66cea78ae 100644 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -4887,6 +4887,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { return x - y * floor(x / y); } + // TODO: use static LogAdd() as defined in TensorOps.h + // Not doing this currently because that one uses ElemType for all ops, while this one uses double inside. Must compare before making this change. template ElemType Matrix::LogAdd(ElemType x, ElemType y) { diff --git a/Source/Math/TensorOps.h b/Source/Math/TensorOps.h new file mode 100644 index 000000000..1ee9821de --- /dev/null +++ b/Source/Math/TensorOps.h @@ -0,0 +1,132 @@ +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// + +// This implements the elementwise tensor operations, including helper macros and some actual functions. + +#pragma once + +#include "Basics.h" +#include "CommonMatrix.h" + +#pragma push_macro("TENSOR_OPS_DECL") +#ifndef TENSOR_OPS_DECL // to make these accessible to CUDA kernels, say '#define TENSOR_OPS_DECL __device__ __host__' +#define TENSOR_OPS_DECL +#endif + +#pragma push_macro("DECL") +#define DECL static inline TENSOR_OPS_DECL + +// This class is exported from the Math.dll. +namespace Microsoft { namespace MSR { namespace CNTK { + + // ----------------------------------------------------------------------- + // unified overloads for float/double math functions + // + // Declare float and double versions of the functions f we need as f_(), + // e.g. exp_ -> exp(double), expf(float). 
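+ //
+ // For illustration (a hypothetical helper, not part of this header): the overloads
+ // let templated code stay precision-agnostic, e.g.
+ //   template <class ElemType>
+ //   DECL ElemType Softplus(ElemType z) { return log_((ElemType)1 + exp_(z)); }
+ // resolves to expf()/logf() for float and exp()/log() for double at compile time.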
+ // -----------------------------------------------------------------------
+
+#pragma push_macro("OverloadUnaryMathFns")
+ #define OverloadUnaryMathFns(func) \
+ DECL float func ## _(float arg) { return func ## f(arg); } \
+ DECL double func ## _(double arg) { return func(arg); }
+
+ OverloadUnaryMathFns(fabs); OverloadUnaryMathFns(sqrt);
+ OverloadUnaryMathFns(exp); OverloadUnaryMathFns(log);
+ OverloadUnaryMathFns(tanh); OverloadUnaryMathFns(cos); OverloadUnaryMathFns(sin);
+#pragma pop_macro("OverloadUnaryMathFns")
+
+ // -----------------------------------------------------------------------
+ // additional functions that are standard in our context
+ // -----------------------------------------------------------------------
+
+ template <class ElemType>
+ DECL ElemType Sigmoid(ElemType z)
+ {
+ if (z >= 0)
+ return 1 / (1 + exp_(-z));
+ else
+ {
+ ElemType v = exp_(z);
+ return v / (1 + v);
+ }
+ }
+
+ template <class ElemType>
+ DECL ElemType SigmoidDerivative(ElemType z)
+ {
+ ElemType v = Sigmoid(z);
+ return v * (1 - v);
+ }
+
+ template <class ElemType>
+ DECL ElemType LinearRectifierDerivative(ElemType z)
+ {
+ return z > 0 ? (ElemType)1 : 0;
+ }
+
+ template <class ElemType>
+ DECL ElemType Sqrt(ElemType z)
+ {
+ // BUGBUG: Why clip to 0? An invalid sqrt() should show up as a NaN in the result, instead of hiding it.
+ return sqrt_(z > 0 ? z : 0);
+ }
+
+ template <class ElemType>
+ DECL ElemType LogAdd(ElemType x, ElemType y)
+ {
+ if (x < y)
+ {
+ ElemType temp = x; x = y; y = temp;
+ }
+ ElemType diff = y - x;
+ if (diff < (ElemType)MINLOGEXP)
+ {
+ return (x < (ElemType)LSMALL) ? (ElemType)LZERO : x;
+ }
+ else
+ {
+ ElemType z = exp_(diff);
+ return x + log_((ElemType)1.0 + z);
+ }
+ }
+
+ // -----------------------------------------------------------------------
+ // ElementWiseOperator implementations
+ //
+ // Define a static function for every ElementWiseOperator (CommonMatrix.h).
+ // -----------------------------------------------------------------------
+
+#pragma push_macro("DefUnaryOp")
+ #define DefUnaryOp(op, expr) template <class ElemType> DECL ElemType Op ## op(ElemType a) { return expr; }
+
+ DefUnaryOp(Copy, a);
+ DefUnaryOp(Negate, -a); DefUnaryOp(Not, !a);
+ DefUnaryOp(Abs, fabs_(a));
+ DefUnaryOp(Sigmoid, Sigmoid(a)); DefUnaryOp(SigmoidDerivative, SigmoidDerivative(a)); DefUnaryOp(Tanh, tanh_(a)); DefUnaryOp(Sqrt, Sqrt(a)); DefUnaryOp(Exp, exp_(a)); DefUnaryOp(Log, log_(a)); DefUnaryOp(LinearRectifierDerivative, LinearRectifierDerivative(a)); DefUnaryOp(Cosine, cos_(a)); DefUnaryOp(NegativeSine, -sin_(a));
+#pragma pop_macro("DefUnaryOp")
+
+ // parameterized unary ops
+ //DefUnaryOp(SaturateBetaAlpha); DefUnaryOp(SumAlpha); DefUnaryOp(SubDifferenceToAlpha); DefUnaryOp(SubDifferenceFromAlpha);
+
+#pragma push_macro("DefBinaryOp")
+ #define DefBinaryOp(op, expr) template <class ElemType> DECL ElemType Op ## op(ElemType a, ElemType b) { return expr; }
+
+ DefBinaryOp(Sum, a + b); DefBinaryOp(Difference, a - b); DefBinaryOp(ElementWiseProduct, a*b); DefBinaryOp(ElementWiseQuotient, a / b);
+ DefBinaryOp(LogSum, LogAdd(a, b)); DefBinaryOp(Max, a > b ? a : b); DefBinaryOp(Min, a < b ? a : b);
+ DefBinaryOp(EQ, a == b); DefBinaryOp(NE, a != b); DefBinaryOp(GT, a > b); DefBinaryOp(LT, a < b); DefBinaryOp(GE, a >= b); DefBinaryOp(LE, a <= b);
+#pragma pop_macro("DefBinaryOp")
+
+#pragma push_macro("DefTernaryOp")
+ #define DefTernaryOp(op, expr) template <class ElemType> DECL ElemType Op ## op(ElemType a, ElemType b, ElemType c) { return expr; }
+
+ DefTernaryOp(Cond, a ? 
b : c); +#pragma pop_macro("DefTernaryOp") + +}}} +#pragma pop_macro("DECL") +#pragma pop_macro("TENSOR_OPS_DECL") diff --git a/Source/Math/TensorView.h b/Source/Math/TensorView.h index 1ceb0332a..2baaef473 100644 --- a/Source/Math/TensorView.h +++ b/Source/Math/TensorView.h @@ -4,7 +4,7 @@ // // -// This implements the TensorView class, which is a layer around Matrix that reinterprets its content as a generic tensor. +// This implements the TensorView class, which is a layer around Matrix that reinterprets its content as a generic tensor. [fseide] #pragma once @@ -56,26 +56,35 @@ namespace Microsoft { namespace MSR { namespace CNTK { // If beta == 0, c is not read out, i.e. it can be uninitialized or contain NaNs. // ------------------------------------------------------------------- +#pragma push_macro("DeclareUnaryTensorOp") #define DeclareUnaryTensorOp(oper) \ void Do ## oper ## Of(ElemType beta, const TensorView & a, ElemType alpha) { DoUnaryOpOf(beta, a, alpha, ElementWiseOperator::op ## oper); } - DeclareUnaryTensorOp(Copy); - DeclareUnaryTensorOp(Negate); DeclareUnaryTensorOp(Not); - DeclareUnaryTensorOp(Abs); - DeclareUnaryTensorOp(Sigmoid); DeclareUnaryTensorOp(SigmoidDerivative); DeclareUnaryTensorOp(Tanh); DeclareUnaryTensorOp(Sqrt); DeclareUnaryTensorOp(Exp); DeclareUnaryTensorOp(Log); DeclareUnaryTensorOp(LinearRectifierDerivative); DeclareUnaryTensorOp(Cosine); DeclareUnaryTensorOp(NegativeSine); - DeclareUnaryTensorOp(SaturateBetaAlpha); DeclareUnaryTensorOp(SumAlpha); DeclareUnaryTensorOp(SubDifferenceToAlpha); DeclareUnaryTensorOp(SubDifferenceFromAlpha); + ForAllUnaryOps(DeclareUnaryTensorOp); + ForAllParameterizedUnaryOps(DeclareUnaryTensorOp); + //DeclareUnaryTensorOp(Copy); + //DeclareUnaryTensorOp(Negate); DeclareUnaryTensorOp(Not); + //DeclareUnaryTensorOp(Abs); + //DeclareUnaryTensorOp(Sigmoid); DeclareUnaryTensorOp(SigmoidDerivative); DeclareUnaryTensorOp(Tanh); DeclareUnaryTensorOp(Sqrt); DeclareUnaryTensorOp(Exp); DeclareUnaryTensorOp(Log); DeclareUnaryTensorOp(LinearRectifierDerivative); DeclareUnaryTensorOp(Cosine); DeclareUnaryTensorOp(NegativeSine); + //DeclareUnaryTensorOp(SaturateBetaAlpha); DeclareUnaryTensorOp(SumAlpha); DeclareUnaryTensorOp(SubDifferenceToAlpha); DeclareUnaryTensorOp(SubDifferenceFromAlpha); +#pragma pop_macro("DeclareUnaryTensorOp") +#pragma push_macro("DeclareBinaryTensorOp") #define DeclareBinaryTensorOp(oper) \ void Do ## oper ## Of(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha) { DoBinaryOpOf(beta, a, b, alpha, ElementWiseOperator::op ## oper); } - DeclareBinaryTensorOp(Sum); DeclareBinaryTensorOp(Difference); DeclareBinaryTensorOp(ElementWiseProduct); DeclareBinaryTensorOp(ElementWiseQuotient); - DeclareBinaryTensorOp(LogSum); DeclareBinaryTensorOp(Max); DeclareBinaryTensorOp(Min); - DeclareBinaryTensorOp(EQ); DeclareBinaryTensorOp(NE); DeclareBinaryTensorOp(GT); DeclareBinaryTensorOp(LT); DeclareBinaryTensorOp(GE); DeclareBinaryTensorOp(LE); + ForAllBinaryOps(DeclareBinaryTensorOp); + //DeclareBinaryTensorOp(Sum); DeclareBinaryTensorOp(Difference); DeclareBinaryTensorOp(ElementWiseProduct); DeclareBinaryTensorOp(ElementWiseQuotient); + //DeclareBinaryTensorOp(LogSum); DeclareBinaryTensorOp(Max); DeclareBinaryTensorOp(Min); + //DeclareBinaryTensorOp(EQ); DeclareBinaryTensorOp(NE); DeclareBinaryTensorOp(GT); DeclareBinaryTensorOp(LT); DeclareBinaryTensorOp(GE); DeclareBinaryTensorOp(LE); +#pragma pop_macro("DeclareBinaryTensorOp") +#pragma push_macro("DeclareTernaryTensorOp") #define 
DeclareTernaryTensorOp(oper) \ void Do ## oper ## Of(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha) { DoTernaryOpOf(beta, a, b, c, alpha, ElementWiseOperator::op ## oper); } - DeclareTernaryTensorOp(Cond); + ForAllTernaryOps(DeclareTernaryTensorOp); +#pragma pop_macro("DeclareTernaryTensorOp") static void Test(); From b8de2fef4bcd181eb86ada7ef7f29fa2af2fb65b Mon Sep 17 00:00:00 2001 From: Alexey Kamenev Date: Wed, 16 Dec 2015 18:52:17 -0800 Subject: [PATCH 13/19] Added support for distributed reading in ImageReader. --- Source/Readers/ImageReader/ImageReader.cpp | 23 ++++++++++++++++++---- Source/Readers/ImageReader/ImageReader.h | 10 +++++++++- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/Source/Readers/ImageReader/ImageReader.cpp b/Source/Readers/ImageReader/ImageReader.cpp index e613d5d29..2b38da4a3 100644 --- a/Source/Readers/ImageReader/ImageReader.cpp +++ b/Source/Readers/ImageReader/ImageReader.cpp @@ -16,6 +16,7 @@ #include // TODO: this should go away once we update the parameter parsing #include #include +#include namespace Microsoft { namespace MSR { namespace CNTK { @@ -400,6 +401,10 @@ void ImageReader::InitFromConfig(const ConfigRecordType& config) m_prefetch = config(L"prefetch", true); + int cthread = config(L"numCPUThreads", 0); + if (cthread > 0) + omp_set_num_threads(cthread); + m_epochStart = 0; m_mbStart = 0; } @@ -412,11 +417,16 @@ void ImageReader::Destroy() } template -void ImageReader::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples) +void ImageReader::StartDistributedMinibatchLoop(size_t mbSize, size_t epoch, size_t subsetNum, size_t numSubsets, size_t requestedEpochSamples) { assert(mbSize > 0); + assert(numSubsets > 0); + assert(subsetNum < numSubsets); assert(requestedEpochSamples > 0); + m_subsetNum = subsetNum; + m_numSubsets = numSubsets; + if (m_imgListRand) std::shuffle(m_files.begin(), m_files.end(), m_rng); @@ -505,10 +515,15 @@ size_t ImageReader::ReadImages() std::fill(m_labBuf.begin(), m_labBuf.end(), static_cast(0)); + size_t actualMBSize = mbLim - m_mbStart; + size_t iStart = actualMBSize * m_subsetNum / m_numSubsets; + size_t iLim = actualMBSize * (m_subsetNum + 1) / m_numSubsets; + size_t subsetSize = iLim - iStart; + #pragma omp parallel for ordered schedule(dynamic) - for (long long i = 0; i < static_cast(mbLim - m_mbStart); i++) + for (long long i = 0; i < static_cast(subsetSize); i++) { - const auto& p = m_files[i + m_mbStart]; + const auto& p = m_files[m_mbStart + iStart + i]; cv::Mat img{ cv::imread(p.first, cv::IMREAD_COLOR) }; if (!img.data) RuntimeError("Cannot read image file %s", p.first.c_str()); @@ -522,7 +537,7 @@ size_t ImageReader::ReadImages() m_labBuf[m_labDim * i + p.second] = 1; } - return mbLim - m_mbStart; + return subsetSize; } template class ImageReader; diff --git a/Source/Readers/ImageReader/ImageReader.h b/Source/Readers/ImageReader/ImageReader.h index 32e5a8a07..cab5d07b7 100644 --- a/Source/Readers/ImageReader/ImageReader.h +++ b/Source/Readers/ImageReader/ImageReader.h @@ -39,7 +39,12 @@ public: virtual void Init(const ScriptableObjects::IConfigRecord & config) override { InitFromConfig(config); } #endif void Destroy() override; - void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize) override; + bool SupportsDistributedMBRead() const { return true; } + void StartDistributedMinibatchLoop(size_t mbSize, size_t epoch, size_t subsetNum, size_t numSubsets, size_t 
requestedEpochSamples = requestDataSize) override;
+ void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize) override
+ {
+ return StartDistributedMinibatchLoop(mbSize, epoch, 0, 1, requestedEpochSamples);
+ }
 bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices) override;
 bool DataEnd(EndDataType endDataType) override;
@@ -73,6 +78,9 @@ private:
 size_t m_epochStart;
 size_t m_mbStart;

+ size_t m_subsetNum;
+ size_t m_numSubsets;
+
 bool m_prefetch;
 std::future<size_t> m_mbPrefetchFut;
 std::vector<ElemType> m_featBuf;

From 1f26215616ee2524d220efa28fcb1379b03ab722 Mon Sep 17 00:00:00 2001
From: Alexey Kamenev
Date: Thu, 17 Dec 2015 12:31:17 -0800
Subject: [PATCH 14/19] Fixed mbStart in ImageReader for distributed case.

---
 Source/Readers/ImageReader/ImageReader.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Source/Readers/ImageReader/ImageReader.cpp b/Source/Readers/ImageReader/ImageReader.cpp
index 2b38da4a3..6b53514a4 100644
--- a/Source/Readers/ImageReader/ImageReader.cpp
+++ b/Source/Readers/ImageReader/ImageReader.cpp
@@ -467,7 +467,6 @@ bool ImageReader::GetMinibatch(std::map
 m_pMBLayout->InitAsFrameMode(mbSize);
- m_mbStart += mbSize;
 // It is safe to run prefetching with just one buffer as SetValue is synchronous so there will be no race.
 m_mbPrefetchFut = std::async(GetLaunchPolicy(m_prefetch), [this]() { return ReadImages(); });
@@ -537,6 +536,7 @@ size_t ImageReader::ReadImages()
 m_labBuf[m_labDim * i + p.second] = 1;
 }
+ m_mbStart += actualMBSize;
 return subsetSize;
 }

From 1a1bd17c21bd3f4bfb9881a4ef0cde1a0f3abf5f Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Fri, 18 Dec 2015 10:01:17 -0800
Subject: [PATCH 15/19] bug fix: ComputationNode::DetermineNumCols() was an outdated pre-refactoring hold-over with a now incorrect validity check. Can just be removed. Should fix the issue reported by user xiaoqing; removed unnecessary and inconsistent use of 'this->' throughout Matrix.cpp, also fixed some bad indentations

---
 .../ComputationNetworkLib/ComputationNode.cpp | 3 +-
 .../ComputationNetworkLib/ComputationNode.h | 12 -
 Source/Math/Matrix.cpp | 684 +++++++++---------
 Source/Math/Matrix.h | 1 +
 4 files changed, 344 insertions(+), 356 deletions(-)

diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp
index a104632f7..d6738861e 100644
--- a/Source/ComputationNetworkLib/ComputationNode.cpp
+++ b/Source/ComputationNetworkLib/ComputationNode.cpp
@@ -43,13 +43,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 // all are consistent: install it
 LinkToMBLayout(pMBLayout);
 }
+ // single input that maps its input element-wise (e.g. Sigmoid)
 void ComputationNodeBase::ValidateUnaryMap(bool isFinalValidationPass)
 {
 assert(m_inputs.size() == 1);
 ComputationNodeBase::Validate(isFinalValidationPass);
 InferMBLayoutFromInputsForStandardCase();
- SetDims(m_inputs[0]->GetNumRows(), DetermineNumCols(m_inputs[0]));
+ SetDims(m_inputs[0]);
 InferImageDimsFromInputs();
 }
 // binary zip operation, e.g. 
Plus diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 7f89060a6..a7649e4c7 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -340,18 +340,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } // helper functions for common cases - private: - // determine number of columns from a child and/or layout - size_t DetermineNumCols(const ComputationNodeBasePtr & child) const - { - size_t childCols = child->GetNumCols(); // this is what the child says - if (!m_pMBLayout) // no layout: copy from child - return childCols; - size_t cols = m_pMBLayout->GetNumCols(); // layout: get it from there, but validate against child - if (childCols != cols) - RuntimeError("%ls %ls operation: Mismatch in number of columns", OperationName().c_str(), NodeName().c_str()); - return cols; - } protected: void ValidateUnaryMap(bool isFinalValidationPass); void ValidateUnaryReduce(bool isFinalValidationPass); diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index 66cea78ae..c223cdc43 100644 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -529,7 +529,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { template Matrix::~Matrix(void) { - this->Clear(); + Clear(); } template @@ -652,14 +652,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (GetMatrixType() == MatrixType::DENSE) { - for (size_t i = this->GetNumCols()-1; i >= -numShift; i--) - { - Matrix inp = this->ColumnSlice(i + numShift, 1); - Matrix out = this->ColumnSlice(i, 1) ; - out = inp; - } - for (size_t i = 0; i < min(this->GetNumCols(), -numShift); i++) - this->ColumnSlice(i, 1).SetValue(0); + for (size_t i = GetNumCols() - 1; i >= -numShift; i--) + { + Matrix inp = ColumnSlice(i + numShift, 1); + Matrix out = ColumnSlice(i, 1); + out = inp; + } + for (size_t i = 0; i < min(GetNumCols(), -numShift); i++) + ColumnSlice(i, 1).SetValue(0); } else if (GetMatrixType() == MatrixType::SPARSE) { @@ -1029,8 +1029,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, nullptr, - return this->m_CPUMatrix->Get00Element(), - return this->m_GPUMatrix->Get00Element(), + return m_CPUMatrix->Get00Element(), + return m_GPUMatrix->Get00Element(), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1071,7 +1071,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (IsEmpty()) LogicError("Transpose: Matrix is empty."); - Matrix c(this->GetNumCols(), this->GetNumRows(), (DEVICEID_TYPE)this->GetDeviceId()); + Matrix c(GetNumCols(), GetNumRows(), (DEVICEID_TYPE)GetDeviceId()); c.AssignTransposeOf(*this); return c; } @@ -1084,10 +1084,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignTransposeOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignTransposeOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignTransposeOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignTransposeOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignTransposeOf(*a.m_GPUSparseMatrix) + m_GPUSparseMatrix->AssignTransposeOf(*a.m_GPUSparseMatrix) ); return *this; @@ -1149,8 +1149,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->MaskColumnsValue(*columnsMask.m_CPUMatrix, val), - this->m_GPUMatrix->MaskColumnsValue(*columnsMask.m_GPUMatrix, val), + m_CPUMatrix->MaskColumnsValue(*columnsMask.m_CPUMatrix, val), + m_GPUMatrix->MaskColumnsValue(*columnsMask.m_GPUMatrix, val), 
NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1164,8 +1164,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->SetColumn(colPointer,colInd), - this->m_GPUMatrix->SetColumn(colPointer,colInd), + m_CPUMatrix->SetColumn(colPointer,colInd), + m_GPUMatrix->SetColumn(colPointer,colInd), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1189,8 +1189,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->SetColumn(*colMat.m_CPUMatrix,colInd), - this->m_GPUMatrix->SetColumn(*colMat.m_GPUMatrix, colInd), + m_CPUMatrix->SetColumn(*colMat.m_CPUMatrix,colInd), + m_GPUMatrix->SetColumn(*colMat.m_GPUMatrix, colInd), NOT_IMPLEMENTED, NOT_IMPLEMENTED); } @@ -1202,16 +1202,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (this == &deepCopyFrom) return; - this->m_preferredDeviceId = deepCopyFrom.m_preferredDeviceId; + m_preferredDeviceId = deepCopyFrom.m_preferredDeviceId; DecideAndMoveToRightDevice(deepCopyFrom, *this); - this->SwitchToMatrixType(deepCopyFrom.GetMatrixType(), format, false); + SwitchToMatrixType(deepCopyFrom.GetMatrixType(), format, false); DISPATCH_MATRIX_ON_FLAG(&deepCopyFrom, this, - this->m_CPUMatrix->SetValue(*deepCopyFrom.m_CPUMatrix), - this->m_GPUMatrix->SetValue(*deepCopyFrom.m_GPUMatrix), - this->m_CPUSparseMatrix->SetValue(*deepCopyFrom.m_CPUSparseMatrix), - this->m_GPUSparseMatrix->SetValue(*deepCopyFrom.m_GPUSparseMatrix) + m_CPUMatrix->SetValue(*deepCopyFrom.m_CPUMatrix), + m_GPUMatrix->SetValue(*deepCopyFrom.m_GPUMatrix), + m_CPUSparseMatrix->SetValue(*deepCopyFrom.m_CPUSparseMatrix), + m_GPUSparseMatrix->SetValue(*deepCopyFrom.m_GPUSparseMatrix) ); } @@ -1391,8 +1391,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { nullptr, ScaleAndAdd((1-momentum) * learnRatePerSample, gradients, momentum, *this); functionValues -= *this, ScaleAndAdd((1-momentum) * learnRatePerSample, gradients, momentum, *this); functionValues -= *this, - if (momentum != 0) gradients.m_CPUSparseMatrix->NormalGrad(*this->m_CPUMatrix, momentum); ScaleAndAdd(-learnRatePerSample, gradients, functionValues), - if (momentum != 0) gradients.m_GPUSparseMatrix->NormalGrad(*this->m_GPUMatrix, momentum); ScaleAndAdd(-learnRatePerSample, gradients, functionValues) + if (momentum != 0) gradients.m_CPUSparseMatrix->NormalGrad(*m_CPUMatrix, momentum); ScaleAndAdd(-learnRatePerSample, gradients, functionValues), + if (momentum != 0) gradients.m_GPUSparseMatrix->NormalGrad(*m_GPUMatrix, momentum); ScaleAndAdd(-learnRatePerSample, gradients, functionValues) ); } @@ -1402,17 +1402,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DecideAndMoveToRightDevice(*this, gradients); - ElemType aveMultiplier = 1.0f; - DISPATCH_MATRIX_ON_FLAG(&gradients, &gradients, - aveMultiplier = m_CPUMatrix->Adagrad(*gradients.m_CPUMatrix, needAveMultiplier); SetDataLocation(CPU), - aveMultiplier = m_GPUMatrix->Adagrad(*gradients.m_GPUMatrix, needAveMultiplier); SetDataLocation(GPU), - aveMultiplier = gradients.m_CPUSparseMatrix->Adagrad(*this->m_CPUMatrix, needAveMultiplier); SetDataLocation(CPU), - aveMultiplier = gradients.m_GPUSparseMatrix->Adagrad(*this->m_GPUMatrix, needAveMultiplier); SetDataLocation(GPU) + return m_CPUMatrix->Adagrad(*gradients.m_CPUMatrix, needAveMultiplier); SetDataLocation(CPU), + return m_GPUMatrix->Adagrad(*gradients.m_GPUMatrix, needAveMultiplier); SetDataLocation(GPU), + return gradients.m_CPUSparseMatrix->Adagrad(*m_CPUMatrix, needAveMultiplier); 
SetDataLocation(CPU), + return gradients.m_GPUSparseMatrix->Adagrad(*m_GPUMatrix, needAveMultiplier); SetDataLocation(GPU) ); - - return aveMultiplier; } template @@ -1449,17 +1445,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DecideAndMoveToRightDevice(*this, gradients); - ElemType aveMultiplier = 1.0f; - DISPATCH_MATRIX_ON_FLAG(this, &gradients, - aveMultiplier = m_CPUMatrix->RmsProp(*gradients.m_CPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier); SetDataLocation(CPU), - aveMultiplier = m_GPUMatrix->RmsProp(*gradients.m_GPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier); SetDataLocation(GPU), + return m_CPUMatrix->RmsProp(*gradients.m_CPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier); SetDataLocation(CPU), + return m_GPUMatrix->RmsProp(*gradients.m_GPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier); SetDataLocation(GPU), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); - - return aveMultiplier; } template @@ -1610,8 +1602,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->operator+=(*a.m_CPUMatrix), - this->m_GPUMatrix->operator+=(*a.m_GPUMatrix), + m_CPUMatrix->operator+=(*a.m_CPUMatrix), + m_GPUMatrix->operator+=(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED); @@ -1631,7 +1623,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, &c, c += (*this)(0,0), - c += (this->m_GPUMatrix->Get00Element()), // BUGBUG: efficiency + c += (m_GPUMatrix->Get00Element()), // BUGBUG: efficiency c += (*this)(0,0), NOT_IMPLEMENTED ); @@ -1697,8 +1689,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignRowSliceValuesOf(*a.m_CPUMatrix, startIndex, numRows), - this->m_GPUMatrix->AssignRowSliceValuesOf(*a.m_GPUMatrix, startIndex, numRows), + m_CPUMatrix->AssignRowSliceValuesOf(*a.m_CPUMatrix, startIndex, numRows), + m_GPUMatrix->AssignRowSliceValuesOf(*a.m_GPUMatrix, startIndex, numRows), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1717,8 +1709,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignToRowSliceValuesOf(*a.m_CPUMatrix, startIndex, numRows), - this->m_GPUMatrix->AssignToRowSliceValuesOf(*a.m_GPUMatrix, startIndex, numRows), + m_CPUMatrix->AssignToRowSliceValuesOf(*a.m_CPUMatrix, startIndex, numRows), + m_GPUMatrix->AssignToRowSliceValuesOf(*a.m_GPUMatrix, startIndex, numRows), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1738,8 +1730,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AddToRowSliceValuesOf(*a.m_CPUMatrix, startIndex, numRows), - this->m_GPUMatrix->AddToRowSliceValuesOf(*a.m_GPUMatrix, startIndex, numRows), + m_CPUMatrix->AddToRowSliceValuesOf(*a.m_CPUMatrix, startIndex, numRows), + m_GPUMatrix->AddToRowSliceValuesOf(*a.m_GPUMatrix, startIndex, numRows), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1759,8 +1751,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AddWithRowSliceValuesOf(*a.m_CPUMatrix, startIndex, numRows), - this->m_GPUMatrix->AddWithRowSliceValuesOf(*a.m_GPUMatrix, startIndex, numRows), + m_CPUMatrix->AddWithRowSliceValuesOf(*a.m_CPUMatrix, startIndex, numRows), + m_GPUMatrix->AddWithRowSliceValuesOf(*a.m_GPUMatrix, startIndex, numRows), 
NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1842,8 +1834,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignRepeatOf(*a.m_CPUMatrix, numRowRepeats, numColRepeats), - this->m_GPUMatrix->AssignRepeatOf(*a.m_GPUMatrix, numRowRepeats, numColRepeats), + m_CPUMatrix->AssignRepeatOf(*a.m_CPUMatrix, numRowRepeats, numColRepeats), + m_GPUMatrix->AssignRepeatOf(*a.m_GPUMatrix, numRowRepeats, numColRepeats), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1862,8 +1854,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AddToRowRepeatValuesOf(*a.m_CPUMatrix, numRepeats), - this->m_GPUMatrix->AddToRowRepeatValuesOf(*a.m_GPUMatrix, numRepeats), + m_CPUMatrix->AddToRowRepeatValuesOf(*a.m_CPUMatrix, numRepeats), + m_GPUMatrix->AddToRowRepeatValuesOf(*a.m_GPUMatrix, numRepeats), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1885,8 +1877,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignPositiveAndShiftedNegSample(*a.m_CPUMatrix, posNumber, negNumber, shiftNumber), - this->m_GPUMatrix->AssignPositiveAndShiftedNegSample(*a.m_GPUMatrix, posNumber, negNumber, shiftNumber), + m_CPUMatrix->AssignPositiveAndShiftedNegSample(*a.m_CPUMatrix, posNumber, negNumber, shiftNumber), + m_GPUMatrix->AssignPositiveAndShiftedNegSample(*a.m_GPUMatrix, posNumber, negNumber, shiftNumber), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1908,8 +1900,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AddFoldedPositiveAndShiftedNegSample(*a.m_CPUMatrix, posNumber, negNumber, shiftNumber), - this->m_GPUMatrix->AddFoldedPositiveAndShiftedNegSample(*a.m_GPUMatrix, posNumber, negNumber, shiftNumber), + m_CPUMatrix->AddFoldedPositiveAndShiftedNegSample(*a.m_CPUMatrix, posNumber, negNumber, shiftNumber), + m_GPUMatrix->AddFoldedPositiveAndShiftedNegSample(*a.m_GPUMatrix, posNumber, negNumber, shiftNumber), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1928,8 +1920,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignDifferenceOf(alpha,*a.m_CPUMatrix), - this->m_GPUMatrix->AssignDifferenceOf(alpha,*a.m_GPUMatrix), + m_CPUMatrix->AssignDifferenceOf(alpha,*a.m_CPUMatrix), + m_GPUMatrix->AssignDifferenceOf(alpha,*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1948,8 +1940,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignDifferenceOf(*a.m_CPUMatrix, alpha), - this->m_GPUMatrix->AssignDifferenceOf(*a.m_GPUMatrix, alpha), + m_CPUMatrix->AssignDifferenceOf(*a.m_CPUMatrix, alpha), + m_GPUMatrix->AssignDifferenceOf(*a.m_GPUMatrix, alpha), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1969,8 +1961,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - *this->m_CPUMatrix -= *a.m_CPUMatrix, - *this->m_GPUMatrix -= *a.m_GPUMatrix, + *m_CPUMatrix -= *a.m_CPUMatrix, + *m_GPUMatrix -= *a.m_GPUMatrix, NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2019,7 +2011,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { template Matrix Matrix::operator* (ElemType alpha) const { - Matrix c(GetNumRows(), GetNumCols(), (DEVICEID_TYPE)this->m_preferredDeviceId); + Matrix c(GetNumRows(), GetNumCols(), (DEVICEID_TYPE)m_preferredDeviceId); Scale(alpha, *this, c); return c; } @@ -2081,7 +2073,7 @@ namespace Microsoft { namespace MSR { namespace 
CNTK { DISPATCH_MATRIX_ON_FLAG(this, nullptr, c.AssignProductOf((*this)(0,0), a), - c.AssignProductOf(this->m_GPUMatrix->Get00Element(), a), // BUGBUG: efficiency + c.AssignProductOf(m_GPUMatrix->Get00Element(), a), // BUGBUG: efficiency c.AssignProductOf((*this)(0,0), a), NOT_IMPLEMENTED ); @@ -2104,7 +2096,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } else { - Matrix c(this->GetNumRows(), a.GetNumCols(), (DEVICEID_TYPE)GetPreferredDeviceId()); + Matrix c(GetNumRows(), a.GetNumCols(), (DEVICEID_TYPE)GetPreferredDeviceId()); Multiply(*this, a, c); return c; } @@ -2185,15 +2177,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (!(a.GetMatrixType() == b.GetMatrixType())) NOT_IMPLEMENTED; - this->SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); + SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignElementProductOf(*a.m_CPUMatrix,*b.m_CPUMatrix), - this->m_GPUMatrix->AssignElementProductOf(*a.m_GPUMatrix,*b.m_GPUMatrix), + m_CPUMatrix->AssignElementProductOf(*a.m_CPUMatrix,*b.m_CPUMatrix), + m_GPUMatrix->AssignElementProductOf(*a.m_GPUMatrix,*b.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); + return *this; } @@ -2217,8 +2210,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, nullptr, - this->m_CPUMatrix->AddElementProductOf(*a.m_CPUMatrix,*b.m_CPUMatrix), - this->m_GPUMatrix->AddElementProductOf(*a.m_GPUMatrix,*b.m_GPUMatrix), + m_CPUMatrix->AddElementProductOf(*a.m_CPUMatrix,*b.m_CPUMatrix), + m_GPUMatrix->AddElementProductOf(*a.m_GPUMatrix,*b.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2246,8 +2239,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignElementDivisionOf(*a.m_CPUMatrix,*b.m_CPUMatrix), - this->m_GPUMatrix->AssignElementDivisionOf(*a.m_GPUMatrix,*b.m_GPUMatrix), + m_CPUMatrix->AssignElementDivisionOf(*a.m_CPUMatrix,*b.m_CPUMatrix), + m_GPUMatrix->AssignElementDivisionOf(*a.m_GPUMatrix,*b.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2274,8 +2267,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->ColumnElementMultiplyWith(*a.m_CPUMatrix), - this->m_GPUMatrix->ColumnElementMultiplyWith(*a.m_GPUMatrix), + m_CPUMatrix->ColumnElementMultiplyWith(*a.m_CPUMatrix), + m_GPUMatrix->ColumnElementMultiplyWith(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2300,8 +2293,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->RowElementMultiplyWith(*a.m_CPUMatrix), - this->m_GPUMatrix->RowElementMultiplyWith(*a.m_GPUMatrix), + m_CPUMatrix->RowElementMultiplyWith(*a.m_CPUMatrix), + m_GPUMatrix->RowElementMultiplyWith(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2326,8 +2319,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->RowElementDivideBy(*a.m_CPUMatrix), - this->m_GPUMatrix->RowElementDivideBy(*a.m_GPUMatrix), + m_CPUMatrix->RowElementDivideBy(*a.m_CPUMatrix), + m_GPUMatrix->RowElementDivideBy(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2354,8 +2347,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->ColumnElementDivideBy(*a.m_CPUMatrix), - this->m_GPUMatrix->ColumnElementDivideBy(*a.m_GPUMatrix), + m_CPUMatrix->ColumnElementDivideBy(*a.m_CPUMatrix), + 
m_GPUMatrix->ColumnElementDivideBy(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2370,10 +2363,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->ElementInverse(), - this->m_GPUMatrix->ElementInverse(), + m_CPUMatrix->ElementInverse(), + m_GPUMatrix->ElementInverse(), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->ElementInverse() + m_GPUSparseMatrix->ElementInverse() ); return (*this); @@ -2386,14 +2379,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { LogicError("AssignElementInverseOf: Matrix a is empty."); DecideAndMoveToRightDevice(a, *this); - this->SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); + SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignElementInverseOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignElementInverseOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignElementInverseOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignElementInverseOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignElementInverseOf(*a.m_GPUSparseMatrix) + m_GPUSparseMatrix->AssignElementInverseOf(*a.m_GPUSparseMatrix) ); return *this; @@ -2404,10 +2397,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceSigmoid(), - this->m_GPUMatrix->InplaceSigmoid(), + m_CPUMatrix->InplaceSigmoid(), + m_GPUMatrix->InplaceSigmoid(), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->InplaceSigmoid() + m_GPUSparseMatrix->InplaceSigmoid() ); return (*this); @@ -2421,10 +2414,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignSigmoidOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignSigmoidOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignSigmoidOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignSigmoidOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignSigmoidOf(*a.m_GPUSparseMatrix) + m_GPUSparseMatrix->AssignSigmoidOf(*a.m_GPUSparseMatrix) ); return *this; @@ -2436,10 +2429,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceLinearRectifierDerivative(), - this->m_GPUMatrix->InplaceLinearRectifierDerivative(), + m_CPUMatrix->InplaceLinearRectifierDerivative(), + m_GPUMatrix->InplaceLinearRectifierDerivative(), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->InplaceLinearRectifierDerivative() + m_GPUSparseMatrix->InplaceLinearRectifierDerivative() ); return (*this); @@ -2453,10 +2446,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignLinearRectifierDerivativeOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignLinearRectifierDerivativeOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignLinearRectifierDerivativeOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignLinearRectifierDerivativeOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignLinearRectifierDerivativeOf(*a.m_GPUSparseMatrix) + m_GPUSparseMatrix->AssignLinearRectifierDerivativeOf(*a.m_GPUSparseMatrix) ); return *this; @@ -2468,8 +2461,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceSigmoidDerivative(), - this->m_GPUMatrix->InplaceSigmoidDerivative(), + m_CPUMatrix->InplaceSigmoidDerivative(), + m_GPUMatrix->InplaceSigmoidDerivative(), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2485,8 +2478,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { 
DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignSigmoidDerivativeOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignSigmoidDerivativeOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignSigmoidDerivativeOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignSigmoidDerivativeOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2506,8 +2499,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignNumOfDiff(*a.m_CPUMatrix, *b.m_CPUMatrix, searchInCol), - this->m_GPUMatrix->AssignNumOfDiff(*a.m_GPUMatrix, *b.m_GPUMatrix, searchInCol), + m_CPUMatrix->AssignNumOfDiff(*a.m_CPUMatrix, *b.m_CPUMatrix, searchInCol), + m_GPUMatrix->AssignNumOfDiff(*a.m_GPUMatrix, *b.m_GPUMatrix, searchInCol), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2520,10 +2513,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceTanh(), - this->m_GPUMatrix->InplaceTanh(), + m_CPUMatrix->InplaceTanh(), + m_GPUMatrix->InplaceTanh(), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->InplaceTanh() + m_GPUSparseMatrix->InplaceTanh() ); return (*this); @@ -2537,10 +2530,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignTanhOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignTanhOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignTanhOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignTanhOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignTanhOf(*a.m_GPUSparseMatrix) + m_GPUSparseMatrix->AssignTanhOf(*a.m_GPUSparseMatrix) ); return *this; @@ -2552,8 +2545,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceLogSoftmax(isColWise), - this->m_GPUMatrix->InplaceLogSoftmax(isColWise), + m_CPUMatrix->InplaceLogSoftmax(isColWise), + m_GPUMatrix->InplaceLogSoftmax(isColWise), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2571,8 +2564,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignLogSoftmaxOf(*a.m_CPUMatrix,isColWise), - this->m_GPUMatrix->AssignLogSoftmaxOf(*a.m_GPUMatrix,isColWise), + m_CPUMatrix->AssignLogSoftmaxOf(*a.m_CPUMatrix,isColWise), + m_GPUMatrix->AssignLogSoftmaxOf(*a.m_GPUMatrix,isColWise), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2586,8 +2579,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceHardmax(isColWise), - this->m_GPUMatrix->InplaceHardmax(isColWise), + m_CPUMatrix->InplaceHardmax(isColWise), + m_GPUMatrix->InplaceHardmax(isColWise), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2605,8 +2598,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignHardmaxOf(*a.m_CPUMatrix, isColWise), - this->m_GPUMatrix->AssignHardmaxOf(*a.m_GPUMatrix, isColWise), + m_CPUMatrix->AssignHardmaxOf(*a.m_CPUMatrix, isColWise), + m_GPUMatrix->AssignHardmaxOf(*a.m_GPUMatrix, isColWise), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2619,10 +2612,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceSqrt(), - this->m_GPUMatrix->InplaceSqrt(), + m_CPUMatrix->InplaceSqrt(), + m_GPUMatrix->InplaceSqrt(), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->InplaceSqrt() + m_GPUSparseMatrix->InplaceSqrt() ); return *this; @@ -2639,10 +2632,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - 
this->m_CPUMatrix->AssignSqrtOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignSqrtOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignSqrtOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignSqrtOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignSqrtOf(*a.m_GPUSparseMatrix) + m_GPUSparseMatrix->AssignSqrtOf(*a.m_GPUSparseMatrix) ); return *this; @@ -2654,10 +2647,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceExp(), - this->m_GPUMatrix->InplaceExp(), + m_CPUMatrix->InplaceExp(), + m_GPUMatrix->InplaceExp(), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->InplaceExp() + m_GPUSparseMatrix->InplaceExp() ); return *this; @@ -2675,10 +2668,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignExpOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignExpOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignExpOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignExpOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignExpOf(*a.m_GPUSparseMatrix) + m_GPUSparseMatrix->AssignExpOf(*a.m_GPUSparseMatrix) ); return *this; @@ -2690,10 +2683,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, nullptr, - this->m_CPUMatrix->InplaceAbs(), - this->m_GPUMatrix->InplaceAbs(), + m_CPUMatrix->InplaceAbs(), + m_GPUMatrix->InplaceAbs(), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->InplaceAbs() + m_GPUSparseMatrix->InplaceAbs() ); return *this; @@ -2710,10 +2703,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignAbsOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignAbsOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignAbsOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignAbsOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignAbsOf(*a.m_GPUSparseMatrix) + m_GPUSparseMatrix->AssignAbsOf(*a.m_GPUSparseMatrix) ); return *this; @@ -2725,10 +2718,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceLog(), - this->m_GPUMatrix->InplaceLog(), + m_CPUMatrix->InplaceLog(), + m_GPUMatrix->InplaceLog(), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->InplaceLog() + m_GPUSparseMatrix->InplaceLog() ); return *this; @@ -2740,7 +2733,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceLog10(), + m_CPUMatrix->InplaceLog10(), NOT_IMPLEMENTED, NOT_IMPLEMENTED, NOT_IMPLEMENTED @@ -2760,10 +2753,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignLogOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignLogOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignLogOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignLogOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignLogOf(*a.m_GPUSparseMatrix) + m_GPUSparseMatrix->AssignLogOf(*a.m_GPUSparseMatrix) ); return *this; @@ -2780,10 +2773,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignLog10Of(*a.m_CPUMatrix), + m_CPUMatrix->AssignLog10Of(*a.m_CPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignLogOf(*a.m_GPUSparseMatrix) + m_GPUSparseMatrix->AssignLogOf(*a.m_GPUSparseMatrix) ); return *this; @@ -2795,8 +2788,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceCosine(), - 
this->m_GPUMatrix->InplaceCosine(), + m_CPUMatrix->InplaceCosine(), + m_GPUMatrix->InplaceCosine(), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2815,8 +2808,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignCosineOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignCosineOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignCosineOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignCosineOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2830,8 +2823,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceNegativeSine(), - this->m_GPUMatrix->InplaceNegativeSine(), + m_CPUMatrix->InplaceNegativeSine(), + m_GPUMatrix->InplaceNegativeSine(), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2850,8 +2843,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignNegativeSineOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignNegativeSineOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignNegativeSineOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignNegativeSineOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2878,10 +2871,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceTruncate(threshold), - this->m_GPUMatrix->InplaceTruncate(threshold), - this->m_CPUSparseMatrix->InplaceTruncate(threshold), - this->m_GPUSparseMatrix->InplaceTruncate(threshold) + m_CPUMatrix->InplaceTruncate(threshold), + m_GPUMatrix->InplaceTruncate(threshold), + m_CPUSparseMatrix->InplaceTruncate(threshold), + m_GPUSparseMatrix->InplaceTruncate(threshold) ); return *this; @@ -2898,7 +2891,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { NOT_IMPLEMENTED, NOT_IMPLEMENTED, NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->InplaceTranspose() + m_GPUSparseMatrix->InplaceTranspose() ); } @@ -2915,10 +2908,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceSoftThreshold(threshold), - this->m_GPUMatrix->InplaceSoftThreshold(threshold), - this->m_CPUSparseMatrix->InplaceSoftThreshold(threshold), - this->m_GPUSparseMatrix->InplaceSoftThreshold(threshold) + m_CPUMatrix->InplaceSoftThreshold(threshold), + m_GPUMatrix->InplaceSoftThreshold(threshold), + m_CPUSparseMatrix->InplaceSoftThreshold(threshold), + m_GPUSparseMatrix->InplaceSoftThreshold(threshold) ); return *this; @@ -2943,10 +2936,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceTruncateBottom(threshold), - this->m_GPUMatrix->InplaceTruncateBottom(threshold), - this->m_CPUSparseMatrix->InplaceTruncateBottom(threshold), - this->m_GPUSparseMatrix->InplaceTruncateBottom(threshold) + m_CPUMatrix->InplaceTruncateBottom(threshold), + m_GPUMatrix->InplaceTruncateBottom(threshold), + m_CPUSparseMatrix->InplaceTruncateBottom(threshold), + m_GPUSparseMatrix->InplaceTruncateBottom(threshold) ); return *this; @@ -2974,17 +2967,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { (*this) = a; return *this; } - } + } DecideAndMoveToRightDevice(a, *this); SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignTruncateBottomOf(*a.m_CPUMatrix, threshold), - this->m_GPUMatrix->AssignTruncateBottomOf(*a.m_GPUMatrix, threshold), + m_CPUMatrix->AssignTruncateBottomOf(*a.m_CPUMatrix, threshold), + m_GPUMatrix->AssignTruncateBottomOf(*a.m_GPUMatrix, 
threshold), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignTruncateBottomOf(*a.m_GPUSparseMatrix, threshold) + m_GPUSparseMatrix->AssignTruncateBottomOf(*a.m_GPUSparseMatrix, threshold) ); return *this; @@ -3010,10 +3003,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceTruncateTop(threshold), - this->m_GPUMatrix->InplaceTruncateTop(threshold), - this->m_CPUSparseMatrix->InplaceTruncateTop(threshold), - this->m_GPUSparseMatrix->InplaceTruncateTop(threshold) + m_CPUMatrix->InplaceTruncateTop(threshold), + m_GPUMatrix->InplaceTruncateTop(threshold), + m_CPUSparseMatrix->InplaceTruncateTop(threshold), + m_GPUSparseMatrix->InplaceTruncateTop(threshold) ); return *this; @@ -3032,7 +3025,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { (*this) = a; return *this; } - } + } else { if (!isfinite(threshold)) @@ -3040,17 +3033,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { (*this) = a; return *this; } - } + } DecideAndMoveToRightDevice(a, *this); SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignTruncateTopOf(*a.m_CPUMatrix, threshold), - this->m_GPUMatrix->AssignTruncateTopOf(*a.m_GPUMatrix, threshold), + m_CPUMatrix->AssignTruncateTopOf(*a.m_CPUMatrix, threshold), + m_GPUMatrix->AssignTruncateTopOf(*a.m_GPUMatrix, threshold), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignTruncateTopOf(*a.m_GPUSparseMatrix, threshold) + m_GPUSparseMatrix->AssignTruncateTopOf(*a.m_GPUSparseMatrix, threshold) ); return *this; @@ -3065,10 +3058,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->SetToZeroIfAbsLessThan(threshold), - this->m_GPUMatrix->SetToZeroIfAbsLessThan(threshold), + m_CPUMatrix->SetToZeroIfAbsLessThan(threshold), + m_GPUMatrix->SetToZeroIfAbsLessThan(threshold), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->SetToZeroIfAbsLessThan(threshold) + m_GPUSparseMatrix->SetToZeroIfAbsLessThan(threshold) ); return *this; @@ -3083,13 +3076,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, nullptr, - return this->m_CPUMatrix->SumOfElements(), - return this->m_GPUMatrix->SumOfElements(), - return this->m_CPUSparseMatrix->SumOfElements(), - return this->m_GPUSparseMatrix->SumOfElements() + return m_CPUMatrix->SumOfElements(), + return m_GPUMatrix->SumOfElements(), + return m_CPUSparseMatrix->SumOfElements(), + return m_GPUSparseMatrix->SumOfElements() ); - - } + } template Matrix& Matrix::AssignSumOfElements(const Matrix& a) @@ -3097,7 +3089,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (a.IsEmpty()) LogicError("AssignSumOfElements: Matrix a is empty."); - //WARNING: a and this must have same type + // WARNING: a and this must have same type if (!(GetMatrixType() == a.GetMatrixType())) NOT_IMPLEMENTED; @@ -3105,8 +3097,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignSumOfElements(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignSumOfElements(*a.m_GPUMatrix), + m_CPUMatrix->AssignSumOfElements(*a.m_CPUMatrix), + m_GPUMatrix->AssignSumOfElements(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -3117,18 +3109,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { template DeviceBoundNumber Matrix::Sum_AsDeviceBoundNum() const { - DeviceBoundNumber result; + DeviceBoundNumber result; DISPATCH_MATRIX_ON_FLAG(this, nullptr, - ElemType* val = 
new ElemType; *val = this->m_CPUMatrix->SumOfElements(); result.ShallowCopyFrom(val,-1); return result, + ElemType* val = new ElemType; *val = m_CPUMatrix->SumOfElements(); result.ShallowCopyFrom(val,-1); return result, return m_GPUMatrix->Sum_AsDeviceBoundNum(), NOT_IMPLEMENTED, NOT_IMPLEMENTED - ); - - return result; - } + ); + } //sum of all elements template @@ -3139,12 +3129,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, nullptr, - return this->m_CPUMatrix->SumOfAbsElements(), - return this->m_GPUMatrix->SumOfAbsElements(), + return m_CPUMatrix->SumOfAbsElements(), + return m_GPUMatrix->SumOfAbsElements(), NOT_IMPLEMENTED, - return this->m_GPUSparseMatrix->SumOfAbsElements() + return m_GPUSparseMatrix->SumOfAbsElements() ); - } + } //sum of all elements template @@ -3155,8 +3145,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, nullptr, - return this->m_CPUMatrix->LogAddSumOfElements(), - return this->m_GPUMatrix->LogAddSumOfElements(), + return m_CPUMatrix->LogAddSumOfElements(), + return m_GPUMatrix->LogAddSumOfElements(), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -3195,12 +3185,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, &c, - this->m_CPUMatrix->VectorNorm1(*c.m_CPUMatrix,isColWise), - this->m_GPUMatrix->VectorNorm1(*c.m_GPUMatrix,isColWise), + m_CPUMatrix->VectorNorm1(*c.m_CPUMatrix,isColWise), + m_GPUMatrix->VectorNorm1(*c.m_GPUMatrix,isColWise), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); - } + } template Matrix& Matrix::AssignVectorNorm1Of(Matrix& a, const bool isColWise) @@ -3220,12 +3210,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, &c, - this->m_CPUMatrix->VectorNorm2(*c.m_CPUMatrix,isColWise), - this->m_GPUMatrix->VectorNorm2(*c.m_GPUMatrix,isColWise), + m_CPUMatrix->VectorNorm2(*c.m_CPUMatrix,isColWise), + m_GPUMatrix->VectorNorm2(*c.m_GPUMatrix,isColWise), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); - } + } template Matrix& Matrix::AssignVectorNorm2Of(Matrix& a, const bool isColWise) @@ -3245,12 +3235,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, &c, - this->m_CPUMatrix->VectorNormInf(*c.m_CPUMatrix,isColWise), - this->m_GPUMatrix->VectorNormInf(*c.m_GPUMatrix,isColWise), + m_CPUMatrix->VectorNormInf(*c.m_CPUMatrix,isColWise), + m_GPUMatrix->VectorNormInf(*c.m_GPUMatrix,isColWise), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); - } + } template Matrix& Matrix::AssignVectorNormInfOf(Matrix& a, const bool isColWise) @@ -3286,8 +3276,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignKhatriRaoProductOf(*a.m_CPUMatrix,*b.m_CPUMatrix), - this->m_GPUMatrix->AssignKhatriRaoProductOf(*a.m_GPUMatrix,*b.m_GPUMatrix), + m_CPUMatrix->AssignKhatriRaoProductOf(*a.m_CPUMatrix,*b.m_CPUMatrix), + m_GPUMatrix->AssignKhatriRaoProductOf(*a.m_GPUMatrix,*b.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -3318,8 +3308,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AddColumnReshapeProductOf(*a.m_CPUMatrix,*b.m_CPUMatrix, transposeAColumn), - this->m_GPUMatrix->AddColumnReshapeProductOf(*a.m_GPUMatrix,*b.m_GPUMatrix, transposeAColumn), + m_CPUMatrix->AddColumnReshapeProductOf(*a.m_CPUMatrix,*b.m_CPUMatrix, transposeAColumn), + m_GPUMatrix->AddColumnReshapeProductOf(*a.m_GPUMatrix,*b.m_GPUMatrix, transposeAColumn), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -3342,12 
+3332,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, nullptr, - return this->m_CPUMatrix->FrobeniusNorm(), - return this->m_GPUMatrix->FrobeniusNorm(), - return this->m_CPUSparseMatrix->FrobeniusNorm(), - return this->m_GPUSparseMatrix->FrobeniusNorm() + return m_CPUMatrix->FrobeniusNorm(), + return m_GPUMatrix->FrobeniusNorm(), + return m_CPUSparseMatrix->FrobeniusNorm(), + return m_GPUSparseMatrix->FrobeniusNorm() ); - } + } template Matrix& Matrix::AssignFrobeniusNormOf(const Matrix& a) @@ -3355,7 +3345,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (a.IsEmpty()) LogicError("AssignFrobeniusNormOf: Matrix a is empty."); - this->Resize(1,1); + Resize(1,1); //WARNING: a and this must have same type if (! (GetMatrixType() == a.GetMatrixType())) @@ -3365,8 +3355,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignFrobeniusNormOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignFrobeniusNormOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignFrobeniusNormOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignFrobeniusNormOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -3382,12 +3372,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, nullptr, - return this->m_CPUMatrix->MatrixNormInf(), - return this->m_GPUMatrix->MatrixNormInf(), + return m_CPUMatrix->MatrixNormInf(), + return m_GPUMatrix->MatrixNormInf(), NOT_IMPLEMENTED, - return this->m_GPUSparseMatrix->MatrixNormInf() + return m_GPUSparseMatrix->MatrixNormInf() ); - } + } template ElemType Matrix::MatrixNorm1() const @@ -3397,13 +3387,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, nullptr, - return this->m_CPUMatrix->MatrixNorm1(), - return this->m_GPUMatrix->MatrixNorm1(), + return m_CPUMatrix->MatrixNorm1(), + return m_GPUMatrix->MatrixNorm1(), NOT_IMPLEMENTED, - return this->m_GPUSparseMatrix->MatrixNorm1() + return m_GPUSparseMatrix->MatrixNorm1() ); - - } + } template ElemType Matrix::MatrixNorm0() const @@ -3413,12 +3402,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, nullptr, - return this->m_CPUMatrix->MatrixNorm0(), - return this->m_GPUMatrix->MatrixNorm0(), + return m_CPUMatrix->MatrixNorm0(), + return m_GPUMatrix->MatrixNorm0(), NOT_IMPLEMENTED, - return this->m_GPUSparseMatrix->MatrixNorm0() + return m_GPUSparseMatrix->MatrixNorm0() ); - } + } template Matrix& Matrix::AssignSignOf(const Matrix& a) @@ -3435,8 +3424,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignSignOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignSignOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignSignOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignSignOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -3456,8 +3445,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AddSignOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AddSignOf(*a.m_GPUMatrix), + m_CPUMatrix->AddSignOf(*a.m_CPUMatrix), + m_GPUMatrix->AddSignOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -3478,12 +3467,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, &maxValues, - this->m_CPUMatrix->VectorMax(*maxIndexes.m_CPUMatrix, *maxValues.m_CPUMatrix, isColWise); maxIndexes.SetDataLocation(CPU, DENSE), - this->m_GPUMatrix->VectorMax(*maxIndexes.m_GPUMatrix, *maxValues.m_GPUMatrix, isColWise); 
maxIndexes.SetDataLocation(GPU, DENSE), + m_CPUMatrix->VectorMax(*maxIndexes.m_CPUMatrix, *maxValues.m_CPUMatrix, isColWise); maxIndexes.SetDataLocation(CPU, DENSE), + m_GPUMatrix->VectorMax(*maxIndexes.m_GPUMatrix, *maxValues.m_GPUMatrix, isColWise); maxIndexes.SetDataLocation(GPU, DENSE), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); - } template @@ -3498,8 +3486,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, &maxValues, - this->m_CPUMatrix->VectorMax(*maxIndexes.m_CPUMatrix, *maxValues.m_CPUMatrix, isColWise, topK); maxIndexes.SetDataLocation(CPU, DENSE), - this->m_GPUMatrix->VectorMax(*maxIndexes.m_GPUMatrix, *maxValues.m_GPUMatrix, isColWise, topK); maxIndexes.SetDataLocation(GPU, DENSE), + m_CPUMatrix->VectorMax(*maxIndexes.m_CPUMatrix, *maxValues.m_CPUMatrix, isColWise, topK); maxIndexes.SetDataLocation(CPU, DENSE), + m_GPUMatrix->VectorMax(*maxIndexes.m_GPUMatrix, *maxValues.m_GPUMatrix, isColWise, topK); maxIndexes.SetDataLocation(GPU, DENSE), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -3517,12 +3505,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, &minValues, - this->m_CPUMatrix->VectorMin(*minIndexes.m_CPUMatrix,*minValues.m_CPUMatrix,isColWise); minIndexes.SetDataLocation(CPU, DENSE), - this->m_GPUMatrix->VectorMin(*minIndexes.m_GPUMatrix,*minValues.m_GPUMatrix,isColWise); minIndexes.SetDataLocation(GPU, DENSE), + m_CPUMatrix->VectorMin(*minIndexes.m_CPUMatrix,*minValues.m_CPUMatrix,isColWise); minIndexes.SetDataLocation(CPU, DENSE), + m_GPUMatrix->VectorMin(*minIndexes.m_GPUMatrix,*minValues.m_GPUMatrix,isColWise); minIndexes.SetDataLocation(GPU, DENSE), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); - } #pragma endregion Member BLAS Functions @@ -3532,7 +3519,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { template wchar_t* Matrix::GetMatrixName() const { - return this->m_baseMatrix->GetMatrixName(); + return m_baseMatrix->GetMatrixName(); } template @@ -3542,23 +3529,23 @@ namespace Microsoft { namespace MSR { namespace CNTK { { if (GetMatrixType() == MatrixType::DENSE) { - this->m_CPUMatrix->SetMatrixName(s); - this->m_GPUMatrix->SetMatrixName(s); + m_CPUMatrix->SetMatrixName(s); + m_GPUMatrix->SetMatrixName(s); } else if (GetMatrixType() == MatrixType::SPARSE) { - this->m_CPUSparseMatrix->SetMatrixName(s); - this->m_GPUSparseMatrix->SetMatrixName(s); + m_CPUSparseMatrix->SetMatrixName(s); + m_GPUSparseMatrix->SetMatrixName(s); } } else { DISPATCH_MATRIX_ON_FLAG(this, nullptr, - this->m_CPUMatrix->SetMatrixName(s), - this->m_GPUMatrix->SetMatrixName(s), - this->m_CPUSparseMatrix->SetMatrixName(s), - this->m_GPUSparseMatrix->SetMatrixName(s) + m_CPUMatrix->SetMatrixName(s), + m_GPUMatrix->SetMatrixName(s), + m_CPUSparseMatrix->SetMatrixName(s), + m_GPUSparseMatrix->SetMatrixName(s) ); } } @@ -3578,8 +3565,37 @@ namespace Microsoft { namespace MSR { namespace CNTK { ); } - //if different and prefered devices are the same, move to preferred device. - //other wise GPU>CPU and if both are GPU move to a's preferred device + // bring two matrices onto the same device + // If different and preferred devices are the same, move to preferred device. + // Otherwise GPU takes precedence over CPU, and if both are GPU move to a's device. + // The inputs are only distinguished in that a's GPU takes precedence over b's in case they differ. + // TODO: This is called somewhat inconsistently, sometimes with a=*this, sometimes with b=*this. 
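To make the precedence rule above concrete, here is the same decision restated as a pure function (a minimal sketch for exposition only; the name DecideTargetDevice is invented here and not part of this code, and CPUDEVICE denotes the CPU's device id as elsewhere in this file):

    static int DecideTargetDevice(int devA, int prefA, int devB, int prefB)
    {
        if (devA == devB)      return devA;   // already on the same device: nothing to do
        if (prefA == prefB)    return prefA;  // both prefer the same device: move both there
        if (devA != CPUDEVICE) return devA;   // a already lives on a GPU: a wins
        return devB;                          // otherwise b's device wins (GPU > CPU)
    }
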
+ template + void Matrix::DecideAndMoveToRightDevice(const Matrix &a, const Matrix &b) + { + int deviceIdA = a.GetDeviceId(), deviceIdB = b.GetDeviceId(); + if (deviceIdA == deviceIdB) + return; + + int preferredDeviceIdA = a.GetPreferredDeviceId(), preferredDeviceIdB = b.GetPreferredDeviceId(); + + if (preferredDeviceIdA == preferredDeviceIdB) // both prefer the same device: move to preferred + { + a._transferToDevice(preferredDeviceIdA); + b._transferToDevice(preferredDeviceIdA); + } + else if (deviceIdA != CPUDEVICE) // one of them lives on GPU: use that + { + b._transferToDevice(deviceIdA); + } + else + { + a._transferToDevice(deviceIdB); + } + } + + // same but for 3 matrices + // If b and c are both on the same GPU then a will be forced to go there; otherwise a's GPU takes precedence, then b's. template void Matrix::DecideAndMoveToRightDevice(const Matrix &a, const Matrix &b, const Matrix &c) { @@ -3589,17 +3605,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { int preferredDeviceIdA = a.GetPreferredDeviceId(), preferredDeviceIdB = b.GetPreferredDeviceId(), preferredDeviceIdC = c.GetPreferredDeviceId(); - if (preferredDeviceIdA == preferredDeviceIdB && preferredDeviceIdA == preferredDeviceIdC) //move to preferred + if (preferredDeviceIdA == preferredDeviceIdB && preferredDeviceIdA == preferredDeviceIdC) { a._transferToDevice(preferredDeviceIdA); b._transferToDevice(preferredDeviceIdA); c._transferToDevice(preferredDeviceIdA); } - else if (deviceIdB == deviceIdC && deviceIdB != CPUDEVICE) + else if (deviceIdB == deviceIdC && deviceIdB != CPUDEVICE) // TODO: why not the other two combinations? { - a._transferToDevice(deviceIdB); + a._transferToDevice(deviceIdB); // 'a' is outvoted } - else if (deviceIdA != CPUDEVICE) //use it + else if (deviceIdA != CPUDEVICE) // one of them lives on GPU: use that { b._transferToDevice(deviceIdA); c._transferToDevice(deviceIdA); @@ -3616,30 +3632,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } - //if different and prefered devices are the same, move to preferred device. - //other wise GPU>CPU and if both are GPU move to a's preferred device + // same but for 4 matrices template - void Matrix::DecideAndMoveToRightDevice(const Matrix &a, const Matrix &b) + void Matrix::DecideAndMoveToRightDevice(const Matrix &a, const Matrix &b, const Matrix &c, const Matrix &d) { - int deviceIdA = a.GetDeviceId(), deviceIdB = b.GetDeviceId(); - if (deviceIdA == deviceIdB) - return; - - int preferredDeviceIdA = a.GetPreferredDeviceId(), preferredDeviceIdB = b.GetPreferredDeviceId(); - - if (preferredDeviceIdA == preferredDeviceIdB) //move to preferred - { - a._transferToDevice(preferredDeviceIdA); - b._transferToDevice(preferredDeviceIdA); - } - else if (deviceIdA != CPUDEVICE) //use it - { - b._transferToDevice(deviceIdA); - } - else - { - a._transferToDevice(deviceIdB); - } + // this function is only called for one operator, so for now we keep it simple + DecideAndMoveToRightDevice(a, b, c); + d._transferToDevice(a.GetDeviceId()); // BUGBUG: Is this correct in case a,b,c share the same preferredDevice? 
} template @@ -3649,7 +3648,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (to_id == from_id) //nothing to do return; - if (this->OwnBuffer()) + if (OwnBuffer()) _transferFromDeviceToDevice(from_id, to_id, ismoved, emptyTransfer); else RuntimeError("Cannot move externally owned matrices to the preferred device."); @@ -3844,16 +3843,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { { if (IsEmpty()) LogicError("Print: Matrix is empty."); - DEVICEID_TYPE orgdevice = this->GetDeviceId(); + DEVICEID_TYPE orgdevice = GetDeviceId(); DISPATCH_MATRIX_ON_FLAG(this, nullptr, - this->m_CPUMatrix->Print(matrixName, rowStart, rowEnd, colStart, colEnd), - _transferToDevice(CPUDEVICE, false, false); this->m_CPUMatrix->Print(matrixName, rowStart, rowEnd, colStart, colEnd); _transferToDevice(orgdevice, false, false), - this->m_CPUSparseMatrix->Print(matrixName), - _transferToDevice(CPUDEVICE, false, false); this->m_CPUSparseMatrix->Print(matrixName); _transferToDevice(orgdevice, false, false) + m_CPUMatrix->Print(matrixName, rowStart, rowEnd, colStart, colEnd), + _transferToDevice(CPUDEVICE, false, false); m_CPUMatrix->Print(matrixName, rowStart, rowEnd, colStart, colEnd); _transferToDevice(orgdevice, false, false), + m_CPUSparseMatrix->Print(matrixName), + _transferToDevice(CPUDEVICE, false, false); m_CPUSparseMatrix->Print(matrixName); _transferToDevice(orgdevice, false, false) ); - } template @@ -4009,11 +4007,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { template Matrix& Matrix::AssignSoftmaxSum(const Matrix& a, const Matrix& softmax) { - this->Resize(1, 1); - if (this->GetDeviceId() < 0) - a.m_CPUMatrix->AssignSoftmaxSum(*softmax.m_CPUMatrix, *this->m_CPUMatrix); + Resize(1, 1); + if (GetDeviceId() < 0) + a.m_CPUMatrix->AssignSoftmaxSum(*softmax.m_CPUMatrix, *m_CPUMatrix); else - a.m_GPUMatrix->AssignSoftmaxSum(*softmax.m_GPUMatrix, *this->m_GPUMatrix); + a.m_GPUMatrix->AssignSoftmaxSum(*softmax.m_GPUMatrix, *m_GPUMatrix); return *this; } @@ -4023,11 +4021,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { //if (a.GetMatrixType() != MatrixType::SPARSE) // NOT_IMPLEMENTED; - this->Resize(1, 1); - if (this->GetDeviceId() < 0) - a.m_CPUMatrix->AssignNCEUnnormalizedEval(*b.m_CPUMatrix, *c.m_CPUMatrix, *bias.m_CPUMatrix, *this->m_CPUMatrix); + Resize(1, 1); + if (GetDeviceId() < 0) + a.m_CPUMatrix->AssignNCEUnnormalizedEval(*b.m_CPUMatrix, *c.m_CPUMatrix, *bias.m_CPUMatrix, *m_CPUMatrix); else - a.m_GPUMatrix->AssignNCEUnnormalizedEval(*b.m_GPUMatrix, *c.m_GPUMatrix, *this->m_GPUMatrix); + a.m_GPUMatrix->AssignNCEUnnormalizedEval(*b.m_GPUMatrix, *c.m_GPUMatrix, *m_GPUMatrix); return *this; } @@ -4037,24 +4035,24 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (a.IsEmpty() || b.IsEmpty() || c.IsEmpty()) LogicError("AssignNoiseContrastiveEstimation: one of the input matrices is empty."); - if (a.GetDeviceId() != b.GetDeviceId() || b.GetDeviceId() != c.GetDeviceId() || c.GetDeviceId() != this->GetDeviceId()) + if (a.GetDeviceId() != b.GetDeviceId() || b.GetDeviceId() != c.GetDeviceId() || c.GetDeviceId() != GetDeviceId()) NOT_IMPLEMENTED; - this->Resize(1, 1); + Resize(1, 1); - if (this->GetDeviceId() < 0) + if (GetDeviceId() < 0) { size_t sampleCount = a.m_CPUMatrix->GetNumElements() / a.m_CPUMatrix->GetNumRows(); tmp.Resize(a.GetNumRows() / 2, sampleCount); a.m_CPUMatrix->AssignNoiseContrastiveEstimation(*b.m_CPUMatrix, *c.m_CPUMatrix, - *bias.m_CPUMatrix, *tmp.m_CPUMatrix, *this->m_CPUMatrix); + *bias.m_CPUMatrix, *tmp.m_CPUMatrix, 
*m_CPUMatrix); } else { size_t sampleCount = a.m_GPUMatrix->GetNumElements() / a.m_GPUMatrix->GetNumRows(); tmp.Resize(a.GetNumRows() / 2, sampleCount); a.m_GPUMatrix->AssignNoiseContrastiveEstimation(*b.m_GPUMatrix, *c.m_GPUMatrix, - *bias.m_GPUMatrix, sampleCount, *tmp.m_GPUMatrix, *this->m_GPUMatrix); + *bias.m_GPUMatrix, sampleCount, *tmp.m_GPUMatrix, *m_GPUMatrix); } return *this; } @@ -4065,13 +4063,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (a.IsEmpty() || b.IsEmpty() || c.IsEmpty()) LogicError("AssignNoiseContrastiveEstimation: one of the input matrices is empty."); - if (a.GetDeviceId() != b.GetDeviceId() || b.GetDeviceId() != c.GetDeviceId() || c.GetDeviceId() != this->GetDeviceId()) + if (a.GetDeviceId() != b.GetDeviceId() || b.GetDeviceId() != c.GetDeviceId() || c.GetDeviceId() != GetDeviceId()) NOT_IMPLEMENTED; assert(tmp.GetNumRows() == a.GetNumRows() / 2); - if (this->GetDeviceId() < 0) + if (GetDeviceId() < 0) { - //samples gradient hidden embedding embedding/hidden + // samples gradient hidden embedding embedding/hidden a.m_CPUMatrix->AssignNCEDerivative(*tmp.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix, inputIndex, *m_CPUMatrix); } else @@ -4496,7 +4494,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { NOT_IMPLEMENTED, NOT_IMPLEMENTED ); - } /// c += alpha * (a-b) @@ -4617,7 +4614,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { NOT_IMPLEMENTED, *c.m_GPUSparseMatrix = (*a.m_GPUSparseMatrix)*alpha ); - } /// Matrix-scalar multiply with col-major matrices: a = alpha * a @@ -4636,7 +4632,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { NOT_IMPLEMENTED, GPUSparseMatrix::Scale(alpha,*a.m_GPUSparseMatrix) ); - } /// Matrix scalar matrix multiply with col-major matrices: a = alpha[0,0] * a @@ -4660,7 +4655,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { NOT_IMPLEMENTED, NOT_IMPLEMENTED ); - } template @@ -4683,7 +4677,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { NOT_IMPLEMENTED, NOT_IMPLEMENTED ); - } template @@ -4695,7 +4688,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { DecideAndMoveToRightDevice(a,b); if (a.GetMatrixType() == b.GetMatrixType()) - { + { DISPATCH_MATRIX_ON_FLAG(&a, nullptr, return CPUMatrix::InnerProductOfMatrices(*a.m_CPUMatrix,*b.m_CPUMatrix), @@ -4703,9 +4696,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { NOT_IMPLEMENTED, NOT_IMPLEMENTED ); - } - else - { + } + else + { DISPATCH_MATRIX_ON_FLAG(&a, nullptr, NOT_IMPLEMENTED, @@ -4722,7 +4715,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (a.IsEmpty() || b.IsEmpty()) LogicError("InnerProductOfMatrices: one of the input matrices is empty."); - this->Resize(1,1); + Resize(1,1); DecideAndMoveToRightDevice(a, b, *this); @@ -4732,8 +4725,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->SetValue(CPUMatrix::InnerProductOfMatrices(*a.m_CPUMatrix,*b.m_CPUMatrix)), - this->m_GPUMatrix->AssignInnerProductOfMatrices(*a.m_GPUMatrix,*b.m_GPUMatrix), + m_CPUMatrix->SetValue(CPUMatrix::InnerProductOfMatrices(*a.m_CPUMatrix,*b.m_CPUMatrix)), + m_GPUMatrix->AssignInnerProductOfMatrices(*a.m_GPUMatrix,*b.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -4955,12 +4948,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (!(a.GetMatrixType() == b.GetMatrixType())) NOT_IMPLEMENTED; - this->SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); + SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); 
DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignElementProductOfWithShiftNeg(*a.m_CPUMatrix, *b.m_CPUMatrix, shift, negnumber), - this->m_GPUMatrix->AssignElementProductOfWithShiftNeg(*a.m_GPUMatrix, *b.m_GPUMatrix, shift, negnumber), + m_CPUMatrix->AssignElementProductOfWithShiftNeg(*a.m_CPUMatrix, *b.m_CPUMatrix, shift, negnumber), + m_GPUMatrix->AssignElementProductOfWithShiftNeg(*a.m_GPUMatrix, *b.m_GPUMatrix, shift, negnumber), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -5012,8 +5005,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->GetARowByIndex(*a.m_CPUMatrix, index), - this->m_GPUMatrix->GetARowByIndex(*a.m_GPUMatrix, index), + m_CPUMatrix->GetARowByIndex(*a.m_CPUMatrix, index), + m_GPUMatrix->GetARowByIndex(*a.m_GPUMatrix, index), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -5060,12 +5053,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (!(a.GetMatrixType() == b.GetMatrixType())) NOT_IMPLEMENTED; - this->SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); + SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignElementProductOfWithShift(*a.m_CPUMatrix, *b.m_CPUMatrix, shift), - this->m_GPUMatrix->AssignElementProductOfWithShift(*a.m_GPUMatrix, *b.m_GPUMatrix, shift), + m_CPUMatrix->AssignElementProductOfWithShift(*a.m_CPUMatrix, *b.m_CPUMatrix, shift), + m_GPUMatrix->AssignElementProductOfWithShift(*a.m_GPUMatrix, *b.m_GPUMatrix, shift), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -5139,12 +5132,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (label.GetNumCols() != gamma.GetNumCols() || label.GetNumRows() != gamma.GetNumRows()) LogicError("DropFrame: label matrix is not in the same size as gamm matrix."); - this->SwitchToMatrixType(label.GetMatrixType(), label.GetFormat(), false); + SwitchToMatrixType(label.GetMatrixType(), label.GetFormat(), false); DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->DropFrame(*label.m_CPUMatrix, *gamma.m_CPUMatrix, threshhold), - this->m_GPUMatrix->DropFrame(*label.m_GPUMatrix, *gamma.m_GPUMatrix, threshhold), + m_CPUMatrix->DropFrame(*label.m_CPUMatrix, *gamma.m_CPUMatrix, threshhold), + m_GPUMatrix->DropFrame(*label.m_GPUMatrix, *gamma.m_GPUMatrix, threshhold), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -5167,13 +5160,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (!(label.GetMatrixType() == gamma.GetMatrixType())) NOT_IMPLEMENTED; - this->SwitchToMatrixType(label.GetMatrixType(), label.GetFormat(), false); - + SwitchToMatrixType(label.GetMatrixType(), label.GetFormat(), false); DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignSequenceError(hsmoothingWeight, *label.m_CPUMatrix, *dnnoutput.m_CPUMatrix, *gamma.m_CPUMatrix, alpha), - this->m_GPUMatrix->AssignSequenceError(hsmoothingWeight, *label.m_GPUMatrix, *dnnoutput.m_GPUMatrix, *gamma.m_GPUMatrix, alpha), + m_CPUMatrix->AssignSequenceError(hsmoothingWeight, *label.m_CPUMatrix, *dnnoutput.m_CPUMatrix, *gamma.m_CPUMatrix, alpha), + m_GPUMatrix->AssignSequenceError(hsmoothingWeight, *label.m_GPUMatrix, *dnnoutput.m_GPUMatrix, *gamma.m_GPUMatrix, alpha), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -5187,6 +5179,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { const vector & regularOpDims, const array, 2> & regularStrides, const vector & reducingOpDims, const array, 2> & reducingStrides) { + DecideAndMoveToRightDevice(*this, a); + DISPATCH_MATRIX_ON_FLAG(this, this, 
m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides), @@ -5202,6 +5196,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { const vector & regularOpDims, const array, 3> & regularStrides, const vector & reducingOpDims, const array, 3> & reducingStrides) { + DecideAndMoveToRightDevice(*this, a, b); + DISPATCH_MATRIX_ON_FLAG(this, this, m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, *b.m_CPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides), @@ -5217,6 +5213,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { const vector & regularOpDims, const array, 4> & regularStrides, const vector & reducingOpDims, const array, 4> & reducingStrides) { + DecideAndMoveToRightDevice(*this, a, b, c); + DISPATCH_MATRIX_ON_FLAG(this, this, m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides), diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index b1a2aa9fa..6f4eaa26b 100644 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -74,6 +74,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void _transferToDevice(int id_to, bool ismoved=true, bool emptyTransfer=false) const; static void DecideAndMoveToRightDevice(const Matrix& a, const Matrix& b); static void DecideAndMoveToRightDevice(const Matrix& a, const Matrix& b, const Matrix& c); + static void DecideAndMoveToRightDevice(const Matrix& a, const Matrix& b, const Matrix& c, const Matrix& d); static void CopyElementsFromDenseToSparse(CPUMatrix& from, CPUSparseMatrix& dest); public: From 9d33fc1efa6de42519469ab207549bf887b35d3f Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 18 Dec 2015 10:55:30 -0800 Subject: [PATCH 16/19] added a specialization of a tensor op for inner dimensions where all strides are 1. 
Seems not quite enough for really efficient unrolling though --- Source/Math/CPUMatrix.cpp | 145 ++++++++++++++++++++++--------------- Source/Math/Matrix.h | 6 +- Source/Math/TensorView.cpp | 9 ++- 3 files changed, 98 insertions(+), 62 deletions(-) diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index ca08faf71..9a9a52940 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -352,7 +352,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { auto& us = *this; -#pragma omp parallel for +#pragma omp parallel for for (long j = 0; j threshold) @@ -4388,7 +4388,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { ElemType v = alpha*a(0,0); long m=(long)c.GetNumRows(), n=(long)c.GetNumCols(); -#pragma omp parallel for +#pragma omp parallel for for (long j=0; j + template struct TensorOpIteration { static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, const std::vector & regularOpDims, const std::array, N> & regularStrides, const std::vector & reducingOpDims, const std::array, N> & reducingStrides) { - // TODO: if leading dim is all-ones, we can hard-code the loop and hope the compiler vectorizes for us // non-scalar case: still nested result loops left array strides; for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled @@ -5576,7 +5575,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t dim = regularOpDims[(size_t)k]; dim--> 0;) { // need to descend into one loop deeper - TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); // advance the pointers for (size_t i = 0; i < N; i++) pointers[i] += strides[i]; @@ -5584,8 +5583,30 @@ namespace Microsoft { namespace MSR { namespace CNTK { } }; - template - struct TensorOpIteration + // Special version: All innermost strides are 1, and there is no further reduction. Compiler can use SSE. + // This is a very common case, e.g. computing the Sigmoid. 
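The payoff of this special case is that the innermost loop collapses into the canonical unit-stride form that auto-vectorizers recognize. A minimal sketch of the loop shape the compiler then sees (illustrative only, not code from this patch):

    // unit-stride elementwise op: contiguous loads and stores, no gather/scatter,
    // so SSE/AVX auto-vectorization can kick in
    void AddVectors(const float* a, const float* b, float* c, size_t K)
    {
        for (size_t k = 0; k < K; k++)
            c[k] = a[k] + b[k];
    }

With arbitrary strides the same loop needs strided loads, which most compilers will not vectorize.
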
+ template + struct TensorOpIteration + { + static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, + const std::vector & regularOpDims, const std::array, N> & regularStrides, + const std::vector & reducingOpDims, const std::array, N> & reducingStrides) + { + size_t K = regularOpDims[0]; +#pragma omp parallel for + for (int k = 0; k < (int)K; k++) + { + // need to descend into one loop deeper + TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + // advance the pointers + for (size_t i = 0; i < N; i++) + pointers[i] += 1; // instead of strides[i]; + } + } + }; + + template + struct TensorOpIteration { static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, const std::vector &, const std::array, N> &, @@ -5604,16 +5625,26 @@ namespace Microsoft { namespace MSR { namespace CNTK { // tensor operation with k+1 dimensions (-1 means scalar) template - static inline void TensorOpWithRegularLoop(ElemType beta, const array & pointers, ElemType alpha, const OPFN & opfn, - const std::vector & regularOpDims, const std::array, N> & regularStrides, - const std::vector & reducingOpDims, const std::array, N> & reducingStrides) + static void TensorOpWithRegularLoop(ElemType beta, const array & pointers, ElemType alpha, const OPFN & opfn, + const std::vector & regularOpDims, const std::array, N> & regularStrides, + const std::vector & reducingOpDims, const std::array, N> & reducingStrides) { size_t dims = reducingOpDims.size(); switch (dims) { - case 2: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - case 1: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - case 0: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 2: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 1: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 0: + { + // if all leading dimensions are 1, we can let the compiler do some unrolling + bool leadingAllOne = true; + for (size_t i = 0; i < N; i++) + leadingAllOne &= k >= 0 && regularStrides[i][0] == 1; + if (leadingAllOne) // special version that uses a hard-coded increment of 1 for all leading dimensions + return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + else + return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + } default: LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (int)dims); } } @@ -5621,10 +5652,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { // tensor operation, generalized in number of arguments, operation already provided as a lambda // This function now expands into different k. 
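The dispatch idiom used here deserves a remark: a run-time dimension count is converted into a compile-time non-type template argument by a switch, so each case instantiates its own fully specialized loop nest. A condensed sketch of the pattern (hypothetical names, for exposition only):

    template <int k>
    static void LoopNest() { /* k is now a compile-time constant the optimizer can unroll against */ }

    static void Dispatch(size_t dims)
    {
        switch (dims)
        {
        case 0: return LoopNest<-1>(); // -1 means scalar, the recursion terminator
        case 1: return LoopNest<0>();
        case 2: return LoopNest<1>();
        default: LogicError("Dispatch: %d dimensions are not supported.", (int)dims);
        }
    }
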
template - static inline void TensorOpWithFn(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, - const std::array & offsets, - const std::vector & regularOpDims, const std::array, N> & regularStrides, - const std::vector & reducingOpDims, const std::array, N> & reducingStrides) + static void TensorOpWithFn(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, + const std::array & offsets, + const std::vector & regularOpDims, const std::array, N> & regularStrides, + const std::vector & reducingOpDims, const std::array, N> & reducingStrides) { for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled pointers[i] += offsets[i]; diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index 6f4eaa26b..837cb6834 100644 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -6,9 +6,8 @@ // TODO: // - remove empty-matrix checks: if an op is well-defined with empty matrices, then do it -// - Resize() must be cheap if it does nothing (I already did that for CPU, still to be done for GPU) -// - an overload for Resize() to match another matrix -// - need a way to grow a minibatch matrix without destroying its content, something like PushColumns() +// - Resize() must be cheap if it does nothing (I already did that for CPU; already done for GPU?) + #pragma once #include "Basics.h" @@ -170,6 +169,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { ElemType RmsProp(Matrix& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier); void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve = 10000, bool growOnly = true); //by default we only reallocate if need to grow + void Resize(const Matrix& other) { Resize(other.GetNumRows(), other.GetNumCols()); } void VerifySize(size_t rows, size_t cols) { m_baseMatrix->VerifySize(rows, cols); diff --git a/Source/Math/TensorView.cpp b/Source/Math/TensorView.cpp index 0676b014d..e5626cd81 100644 --- a/Source/Math/TensorView.cpp +++ b/Source/Math/TensorView.cpp @@ -235,8 +235,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { m2.SetValue(2, 1, { 42, 13 }); - // unary ops - m3.Resize(2, 3); + m3.Resize(m1); + + // regular zip (just add m1 to itself) + TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m1), 1); + m3.Print(); + + // unary op TensorView(m3).DoSqrtOf(0, TensorView(m1), 1); m3.Print(); From c343e98ae9a56850073f00ba64b4f67aac50c8b6 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 18 Dec 2015 13:30:33 -0800 Subject: [PATCH 17/19] further optimized the most frequent tensor loops (1-stride loops for unary and binary ops), but still not seeing 4-way SSE parallelism --- Source/Math/CPUMatrix.cpp | 103 +++++++++++++++++++++++++------------ Source/Math/TensorView.cpp | 16 ++++-- 2 files changed, 80 insertions(+), 39 deletions(-) diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index 9a9a52940..cf670a73c 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -5525,7 +5525,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // perform loop over reduction index m // This function is declared inside a wrapper struct to allow partial specialization (m = -1). 
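The wrapper struct is needed because C++ allows partial specialization of class templates but not of function templates; terminating the recursion at m = -1 therefore requires a struct. In miniature (hypothetical names, exposition only):

    template <class ElemType, int m>
    struct Reduce
    {
        static ElemType Loop(const ElemType* p, const size_t* dims, const ptrdiff_t* strides)
        {
            ElemType sum = 0;
            for (size_t d = 0; d < dims[m]; d++)  // loop over reduction dimension m
                sum += Reduce<ElemType, m - 1>::Loop(p + d * strides[m], dims, strides);
            return sum;
        }
    };
    template <class ElemType>
    struct Reduce<ElemType, -1>  // partial specialization ends the compile-time recursion
    {
        static ElemType Loop(const ElemType* p, const size_t*, const ptrdiff_t*) { return *p; }
    };
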
- template + template struct TensorOpReduction { // reduction case (non-reduction case is specialized) @@ -5539,7 +5539,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t dim = reducingOpDims[(size_t)m]; dim-- > 0;) { // need to descend into one loop deeper - aggregate += TensorOpReduction::Loop(pointers, opfn, reducingOpDims, reducingStrides); + aggregate += TensorOpReduction::Loop(pointers, opfn, reducingOpDims, reducingStrides); // advance the pointers for (size_t i = 0; i < N - 1; i++) pointers[i] += strides[i]; // note: last pointer (result) is unused and untouched here @@ -5550,8 +5550,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // perform loop over reduction index m // This is the specialized version for m = -1, which terminates the recursion. - template - struct TensorOpReduction + template + struct TensorOpReduction { static inline ElemType Loop(array pointers, const OPFN & opfn, const std::vector &, const std::array, N> &) @@ -5561,7 +5561,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { }; // perform loop over regular index k and reducing index m for N operands (counting the output) - template + template struct TensorOpIteration { static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, @@ -5575,7 +5575,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t dim = regularOpDims[(size_t)k]; dim--> 0;) { // need to descend into one loop deeper - TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); // advance the pointers for (size_t i = 0; i < N; i++) pointers[i] += strides[i]; @@ -5583,37 +5583,72 @@ namespace Microsoft { namespace MSR { namespace CNTK { } }; - // Special version: All innermost strides are 1, and there is no further reduction. Compiler can use SSE. - // This is a very common case, e.g. computing the Sigmoid. - template - struct TensorOpIteration + // Special version for innermost loop with strides all being 1 and no further reduction. Compiler can use SSE. + // This is a very common case, e.g. adding vectors or computing the Sigmoid. 
+ template + struct TensorOpIteration { - static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, - const std::vector & regularOpDims, const std::array, N> & regularStrides, - const std::vector & reducingOpDims, const std::array, N> & reducingStrides) + static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, + const std::vector & regularOpDims, const std::array, 3> & regularStrides, + const std::vector & reducingOpDims, const std::array, 3> & reducingStrides) { + ElemType* pa = pointers[0]; + ElemType* pb = pointers[1]; + ElemType* pc = pointers[2]; size_t K = regularOpDims[0]; + // special-case beta and alpha to allow the compiler to short-circuit it + if (beta != 0) #pragma omp parallel for - for (int k = 0; k < (int)K; k++) - { - // need to descend into one loop deeper - TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - // advance the pointers - for (size_t i = 0; i < N; i++) - pointers[i] += 1; // instead of strides[i]; - } + for (int k = 0; k < (int)K; k++) + TensorOpIteration::Loop(beta, array { pa + k, pb + k, pc + k }, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + else if (alpha != 1) +#pragma omp parallel for + for (int k = 0; k < (int)K; k++) + TensorOpIteration::Loop(0, array { pa + k, pb + k, pc + k }, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + else +#pragma omp parallel for + for (int k = 0; k < (int)K; k++) + TensorOpIteration::Loop(0, array { pa + k, pb + k, pc + k }, 1, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + // TODO: somehow this does not use 4-way parallelism with SSE (VS 2013), and the signedness of k (required for omp) causes an extra sign-extend + // TODO: OMP adds LOTS of overhead. Do we need a guard, a min size when to use it? 
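    // A possible answer to the TODO above, sketched as a guard only (the threshold
    // is invented here and would need measurement on the target machine):
    //     if (K >= 16384) { /* the three #pragma omp variants above */ }
    //     else            { /* the same loops without the pragma    */ }
    // i.e. parallelize only above a minimum trip count, so that small tensors
    // skip the OMP fork/join cost entirely.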
+ } + }; + // and unary + template + struct TensorOpIteration + { + static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, + const std::vector & regularOpDims, const std::array, 2> & regularStrides, + const std::vector & reducingOpDims, const std::array, 2> & reducingStrides) + { + ElemType* pa = pointers[0]; + ElemType* pb = pointers[1]; + size_t K = regularOpDims[0]; + // special-case beta and alpha to allow the compiler to short-circuit it + if (beta != 0) +#pragma omp parallel for + for (int k = 0; k < (int)K; k++) + TensorOpIteration::Loop(beta, array { pa + k, pb + k }, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + else if (alpha != 1) +#pragma omp parallel for + for (int k = 0; k < (int)K; k++) + TensorOpIteration::Loop(0, array { pa + k, pb + k }, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + else +#pragma omp parallel for + for (int k = 0; k < (int)K; k++) + TensorOpIteration::Loop(0, array { pa + k, pb + k }, 1, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } }; - template - struct TensorOpIteration + template + struct TensorOpIteration { static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, const std::vector &, const std::array, N> &, const std::vector & reducingOpDims, const std::array, N> & reducingStrides) { // we are at element level for the result: perform the op (there may still be reduction) - ElemType val = alpha * TensorOpReduction::Loop(pointers, opfn, reducingOpDims, reducingStrides); + ElemType val = alpha * TensorOpReduction::Loop(pointers, opfn, reducingOpDims, reducingStrides); // combine with previous value in target matrix, then write it out auto * pout = pointers.back(); if (beta != 0) @@ -5624,7 +5659,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { }; // tensor operation with k+1 dimensions (-1 means scalar) - template + template static void TensorOpWithRegularLoop(ElemType beta, const array & pointers, ElemType alpha, const OPFN & opfn, const std::vector & regularOpDims, const std::array, N> & regularStrides, const std::vector & reducingOpDims, const std::array, N> & reducingStrides) @@ -5632,8 +5667,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t dims = reducingOpDims.size(); switch (dims) { - case 2: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - case 1: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 2: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 1: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); case 0: { // if all leading dimensions are 1, we can let the compiler do some unrolling @@ -5641,9 +5676,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t i = 0; i < N; i++) leadingAllOne &= k >= 0 && regularStrides[i][0] == 1; if (leadingAllOne) // special version that uses a hard-coded increment of 1 for all leading dimensions - return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); else - return TensorOpIteration::Loop(beta, 
pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } default: LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (int)dims); } @@ -5662,11 +5697,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t dims = regularOpDims.size(); switch (dims) { - case 4: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - case 3: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - case 2: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - case 1: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - case 0: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 4: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 3: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 2: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 1: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 0: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); default: LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)dims); } } diff --git a/Source/Math/TensorView.cpp b/Source/Math/TensorView.cpp index e5626cd81..7ee05770b 100644 --- a/Source/Math/TensorView.cpp +++ b/Source/Math/TensorView.cpp @@ -230,10 +230,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { Matrix m2(-1); Matrix m3(-1); { - m1.SetValue(2, 3, { 1, 2, 3, - 14, 15, 6 }); - m2.SetValue(2, 1, { 42, - 13 }); + m1.SetValue(5, 3, { 1, 2, 3, + 14, 15, 6, + 4, 5, 16, + 41, 5, 1, + 1.8, 4.5, 7 }); + m2.SetValue(5, 1, { 42, + 13, + 1968, + 3.1415f, + 7 }); m3.Resize(m1); @@ -256,7 +262,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m3.Print(); // reduction over columns - m3.Resize(2, 1); + m3.Resize(5, 1); TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1); m3.Print(); From cf7fc9fe29ef4185f67ff6f8cd37b19c4f099d8d Mon Sep 17 00:00:00 2001 From: Alexey Kamenev Date: Fri, 18 Dec 2015 15:17:49 -0800 Subject: [PATCH 18/19] Updated baselines for cuDNN. 
--- .../QuickE2E/baseline.windows.debug.gpu.txt | 2670 +++++------------ .../QuickE2E/baseline.windows.release.gpu.txt | 2554 +++++----------- 2 files changed, 1552 insertions(+), 3672 deletions(-) diff --git a/Tests/EndToEndTests/Image/QuickE2E/baseline.windows.debug.gpu.txt b/Tests/EndToEndTests/Image/QuickE2E/baseline.windows.debug.gpu.txt index dafd81007..a0f43cc3a 100644 --- a/Tests/EndToEndTests/Image/QuickE2E/baseline.windows.debug.gpu.txt +++ b/Tests/EndToEndTests/Image/QuickE2E/baseline.windows.debug.gpu.txt @@ -1,253 +1,245 @@ ------------------------------------------------------------------- Build info: - Built time: Nov 23 2015 10:00:15 - Last modified date: Mon Nov 23 09:45:21 2015 - Built by alexeyk on alexey-rz - Build Path: C:\src\cntk\MachineLearning\CNTK\ + Built time: Dec 18 2015 15:12:36 + Last modified date: Wed Dec 16 11:33:30 2015 CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + Build Branch: + Build SHA1: + Built by alexeyk on z840-01 + Build Path: C:\src\cntk\Source\CNTK\ ------------------------------------------------------------------- -running on alexey-rz at 2015/11/23 10:07:45 +running on z840-01 at 2015/12/18 15:13:39 command line: -C:\src\cntk\x64\Debug\CNTK.exe configFile=C:\src\cntk\Tests\Image\QuickE2E\cntk.config RunDir=C:\src\cntk\Tests\Image\_run DataDir=C:\src\cntk\Tests\Image\Data ConfigDir=C:\src\cntk\Tests\Image\QuickE2E DeviceId=0 +C:\src\cntk\x64\Debug\CNTK.exe configFile=QuickE2E\cntk.config ConfigDir=QuickE2E RunDir=_out DataDir=Data DeviceId=Auto stderr=_out\gpu.txt >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=$DeviceId$ -ndlMacros=$ConfigDir$/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=$RunDir$/models/cntk.dnn - deviceId=$DeviceId$ - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = $DeviceId$ +ndlMacros = "$ConfigDir$/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "$RunDir$/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=$RunDir$/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl +test = [ + action = "test" + modelPath = "$RunDir$/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] 
-RunDir=C:\src\cntk\Tests\Image\_run -DataDir=C:\src\cntk\Tests\Image\Data -ConfigDir=C:\src\cntk\Tests\Image\QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=_out\gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=0 -ndlMacros=C:\src\cntk\Tests\Image\QuickE2E/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = Auto +ndlMacros = "QuickE2E/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl +test = [ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -RunDir=C:\src\cntk\Tests\Image\_run -DataDir=C:\src\cntk\Tests\Image\Data -ConfigDir=C:\src\cntk\Tests\Image\QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=_out\gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -configparameters: cntk.config:command=Train:Test -configparameters: cntk.config:ConfigDir=C:\src\cntk\Tests\Image\QuickE2E -configparameters: cntk.config:DataDir=C:\src\cntk\Tests\Image\Data -configparameters: cntk.config:deviceId=0 -configparameters: cntk.config:ndlMacros=C:\src\cntk\Tests\Image\QuickE2E/Macros.ndl +configparameters: cntk.config:command=train:test +configparameters: cntk.config:ConfigDir=QuickE2E +configparameters: cntk.config:DataDir=Data +configparameters: cntk.config:deviceId=Auto +configparameters: cntk.config:ndlMacros=QuickE2E/Macros.ndl +configparameters: cntk.config:numCPUThreads=8 configparameters: cntk.config:parallelTrain=false configparameters: cntk.config:precision=float -configparameters: 
cntk.config:RunDir=C:\src\cntk\Tests\Image\_run -configparameters: cntk.config:Test=[ - action=test - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl +configparameters: cntk.config:RunDir=_out +configparameters: cntk.config:stderr=_out\gpu.txt +configparameters: cntk.config:test=[ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -configparameters: cntk.config:Train=[ - action=train - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +configparameters: cntk.config:train=[ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< -command: Train Test +command: train test precision = float -CNTKModelPath: C:\src\cntk\Tests\Image\_run/models/cntk.dnn -CNTKCommandTrainInfo: Train : 12 +Using 8 CPU threads +CNTKModelPath: _out/models/cntk.dnn +CNTKCommandTrainInfo: train : 12 CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 12 -CNTKCommandTrainBegin: Train +CNTKCommandTrainBegin: train +LockDevice: Locked GPU 0 to test availability. +LockDevice: Unlocked GPU 0 after testing. +LockDevice: Locked GPU 1 to test availability. +LockDevice: Unlocked GPU 1 after testing. +LockDevice: Locked GPU 2 to test availability. +LockDevice: Unlocked GPU 2 after testing. +LockDevice: Locked GPU 0 for exclusive use. NDLBuilder Using GPU 0 -reading uci file C:\src\cntk\Tests\Image\Data/Train.txt +Reading UCI file Data/Train.txt +Microsoft::MSR::CNTK::GPUMatrix::SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4 + +Post-processing network... + +3 roots: + ce = CrossEntropyWithSoftmax + err = ErrorPrediction + outputNodes.z = Plus +FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation +FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation +FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation -Allocating matrices for forward propagation. - - -Printing Gradient Computation Node Order ... 
- -CE[0, 0] = CrossEntropyWithSoftmax(labels[10, 1], OutputNodes.z[0, 0]) -OutputNodes.z[0, 0] = Plus(OutputNodes.t[0, 0], OutputNodes.b[10, 1]) -OutputNodes.b[10, 1] = LearnableParameter -OutputNodes.t[0, 0] = Times(OutputNodes.W[10, 128], h1.y[0, 0]) -h1.y[0, 0] = Sigmoid(h1.z[0, 0]) -h1.z[0, 0] = Plus(h1.t[0, 0], h1.b[128, 1]) -h1.b[128, 1] = LearnableParameter -h1.t[0, 0] = Times(h1.W[128, 512], pool2[0, 0]) -pool2[0, 0] = AveragePooling(conv2_act.act[0, 0]) -conv2_act.act[0, 0] = RectifiedLinear(conv2_act.convPlusB[0, 0]) -conv2_act.convPlusB[0, 0] = Plus(conv2_act.conv[0, 0], conv2_act.convB[32, 1]) -conv2_act.convB[32, 1] = LearnableParameter -conv2_act.conv[0, 0] = Convolution(conv2_act.convW[32, 400], pool1[0, 0]) -pool1[0, 0] = MaxPooling(conv1_act.act[0, 0]) -conv1_act.act[0, 0] = RectifiedLinear(conv1_act.convPlusB[0, 0]) -conv1_act.convPlusB[0, 0] = Plus(conv1_act.conv[0, 0], conv1_act.convB[16, 1]) -conv1_act.convB[16, 1] = LearnableParameter -conv1_act.conv[0, 0] = Convolution(conv1_act.convW[16, 25], featScaled[0, 0]) -featScaled[0, 0] = Scale(featScale[1, 1], features[784, 1]) -features[784, 1] = InputValue -featScale[1, 1] = LearnableParameter -conv1_act.convW[16, 25] = LearnableParameter -conv2_act.convW[32, 400] = LearnableParameter -h1.W[128, 512] = LearnableParameter -OutputNodes.W[10, 128] = LearnableParameter -labels[10, 1] = InputValue - -Validating for node CE. 26 nodes to process in pass 1. +Validating for node ce. 26 nodes to process in pass 1. Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -256,27 +248,27 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> 
[2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE. 15 nodes to process in pass 2. +Validating for node ce. 15 nodes to process in pass 2. 
Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -285,27 +277,27 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], 
h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE, final verification. +Validating for node ce, final verification. Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -314,31 +306,30 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = 
Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. - -Validating for node CE. 26 nodes to process in pass 1. +Validating for node err. 26 nodes to process in pass 1. Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -347,27 +338,27 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] 
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE. 14 nodes to process in pass 2. +Validating for node err. 14 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -376,27 +367,27 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 
{W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE, final verification. +Validating for node err, final verification. Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -405,30 +396,29 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> 
conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Validating for node outputNodes.z. 24 nodes to process in pass 1. -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -437,25 +427,25 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 
0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node OutputNodes.z. 13 nodes to process in pass 2. +Validating for node outputNodes.z. 13 nodes to process in pass 2. -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -464,25 +454,25 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = 
Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node OutputNodes.z, final verification. +Validating for node outputNodes.z, final verification. -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -491,301 +481,39 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] +Validating --> h1.z = 
Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] 9 out of 24 nodes do not share the minibatch layout with the input data. +Post-processing network complete. - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. - -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] - -9 out of 24 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. 
- -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -Validating for node Err. 14 nodes to process in pass 2. 
- -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -Validating for node Err, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. 
- -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -Validating for node Err. 14 nodes to process in pass 2. 
- -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -Validating for node Err, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - -SetUniformRandomValue (GPU): creating curand object with seed 1 SGD using GPU 0. -GetTrainCriterionNodes ... -GetEvalCriterionNodes ... + +Training criterion node(s): + ce = CrossEntropyWithSoftmax + +Evaluation criterion node(s): + err = ErrorPrediction -Allocating matrices for gradient computing +Allocating matrices for forward and/or backward propagation. No PreCompute nodes found, skipping PreCompute step Set Max Temp Mem Size For Convolution Nodes to 0 samples. -Starting Epoch 1: learning rate per sample = 0.005000 effective momentum = 0.000000 +Starting Epoch 1: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting at epoch 0 counting lines to determine record count 1000 records found @@ -793,148 +521,126 @@ starting epoch 0 at record count 0, and file position 0 already there from last epoch Starting minibatch loop. -randomordering: 11 retries for 100 elements (11.0%) to ensure window condition -randomordering: recached sequence for seed 0: 15, 33, ... 
- Epoch[ 1 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.37891785; EvalErr[0]PerSample = 0.93000000; TotalTime = 0.19572s; TotalTimePerSample = 1.95719ms; SamplesPerSecond = 510 -Finished Epoch[ 1 of 12]: [Training Set] TrainLossPerSample = 2.3789177; EvalErrPerSample = 0.93000001; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.226218 -Starting Epoch 2: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 11 retries for 100 elements (11.0%) to ensure window condition +RandomOrdering: recached sequence for seed 0: 15, 33, ... + Epoch[ 1 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.34708694; EvalErr[0]PerSample = 0.92000000; TotalTime = 0.2483s; SamplesPerSecond = 402.8 +Finished Epoch[ 1 of 12]: [Training Set] TrainLossPerSample = 2.3470869; EvalErrPerSample = 0.91999996; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.285798 +Starting Epoch 2: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 1 at record count 100, and file position 100 already there from last epoch Starting minibatch loop. -randomordering: 26 retries for 100 elements (26.0%) to ensure window condition -randomordering: recached sequence for seed 1: 20, 26, ... - Epoch[ 2 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.37942505; EvalErr[0]PerSample = 0.91000000; TotalTime = 0.08022s; TotalTimePerSample = 0.80224ms; SamplesPerSecond = 1246 -Finished Epoch[ 2 of 12]: [Training Set] TrainLossPerSample = 2.379425; EvalErrPerSample = 0.90999997; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.080724 -Starting Epoch 3: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 26 retries for 100 elements (26.0%) to ensure window condition +RandomOrdering: recached sequence for seed 1: 20, 26, ... + Epoch[ 2 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.29444092; EvalErr[0]PerSample = 0.85000000; TotalTime = 0.0975s; SamplesPerSecond = 1025.7 +Finished Epoch[ 2 of 12]: [Training Set] TrainLossPerSample = 2.294441; EvalErrPerSample = 0.84999996; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.100328 +Starting Epoch 3: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 2 at record count 200, and file position 200 already there from last epoch Starting minibatch loop. -randomordering: 28 retries for 100 elements (28.0%) to ensure window condition -randomordering: recached sequence for seed 2: 4, 35, ... - Epoch[ 3 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.32070969; EvalErr[0]PerSample = 0.85000000; TotalTime = 0.08246s; TotalTimePerSample = 0.82460ms; SamplesPerSecond = 1212 -Finished Epoch[ 3 of 12]: [Training Set] TrainLossPerSample = 2.3207097; EvalErrPerSample = 0.84999996; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.083 -Starting Epoch 4: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 28 retries for 100 elements (28.0%) to ensure window condition +RandomOrdering: recached sequence for seed 2: 4, 35, ... 
+ Epoch[ 3 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.13786560; EvalErr[0]PerSample = 0.73000000; TotalTime = 0.0973s; SamplesPerSecond = 1027.9 +Finished Epoch[ 3 of 12]: [Training Set] TrainLossPerSample = 2.1378655; EvalErrPerSample = 0.72999996; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.100033 +Starting Epoch 4: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 3 at record count 300, and file position 300 already there from last epoch Starting minibatch loop. -randomordering: 17 retries for 100 elements (17.0%) to ensure window condition -randomordering: recached sequence for seed 3: 28, 7, ... - Epoch[ 4 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.36838959; EvalErr[0]PerSample = 0.90000000; TotalTime = 0.08074s; TotalTimePerSample = 0.80741ms; SamplesPerSecond = 1238 -Finished Epoch[ 4 of 12]: [Training Set] TrainLossPerSample = 2.3683896; EvalErrPerSample = 0.89999998; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.081265 -Starting Epoch 5: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 17 retries for 100 elements (17.0%) to ensure window condition +RandomOrdering: recached sequence for seed 3: 28, 7, ... + Epoch[ 4 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.03929321; EvalErr[0]PerSample = 0.75000000; TotalTime = 0.0854s; SamplesPerSecond = 1171.3 +Finished Epoch[ 4 of 12]: [Training Set] TrainLossPerSample = 2.0392931; EvalErrPerSample = 0.75; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.08801 +Starting Epoch 5: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 4 at record count 400, and file position 400 already there from last epoch Starting minibatch loop. -randomordering: 15 retries for 100 elements (15.0%) to ensure window condition -randomordering: recached sequence for seed 4: 5, 36, ... - Epoch[ 5 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.32354156; EvalErr[0]PerSample = 0.84000000; TotalTime = 0.07892s; TotalTimePerSample = 0.78921ms; SamplesPerSecond = 1267 -Finished Epoch[ 5 of 12]: [Training Set] TrainLossPerSample = 2.3235414; EvalErrPerSample = 0.83999997; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.079374 -Starting Epoch 6: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 15 retries for 100 elements (15.0%) to ensure window condition +RandomOrdering: recached sequence for seed 4: 5, 36, ... + Epoch[ 5 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.77985352; EvalErr[0]PerSample = 0.53000000; TotalTime = 0.0979s; SamplesPerSecond = 1021.3 +Finished Epoch[ 5 of 12]: [Training Set] TrainLossPerSample = 1.7798535; EvalErrPerSample = 0.52999997; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.100739 +Starting Epoch 6: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 5 at record count 500, and file position 500 already there from last epoch Starting minibatch loop. -randomordering: 13 retries for 100 elements (13.0%) to ensure window condition -randomordering: recached sequence for seed 5: 11, 48, ... 
- Epoch[ 6 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.24672409; EvalErr[0]PerSample = 0.83000000; TotalTime = 0.07987s; TotalTimePerSample = 0.79865ms; SamplesPerSecond = 1252 -Finished Epoch[ 6 of 12]: [Training Set] TrainLossPerSample = 2.2467241; EvalErrPerSample = 0.82999998; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.08033 -Starting Epoch 7: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 13 retries for 100 elements (13.0%) to ensure window condition +RandomOrdering: recached sequence for seed 5: 11, 48, ... + Epoch[ 6 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.49362656; EvalErr[0]PerSample = 0.14000000; TotalTime = 0.0967s; SamplesPerSecond = 1033.8 +Finished Epoch[ 6 of 12]: [Training Set] TrainLossPerSample = 1.4936265; EvalErrPerSample = 0.14; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.09948 +Starting Epoch 7: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 6 at record count 600, and file position 600 already there from last epoch Starting minibatch loop. -randomordering: 13 retries for 100 elements (13.0%) to ensure window condition -randomordering: recached sequence for seed 6: 15, 3, ... - Epoch[ 7 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.09912888; EvalErr[0]PerSample = 0.69000000; TotalTime = 0.07999s; TotalTimePerSample = 0.79993ms; SamplesPerSecond = 1250 -Finished Epoch[ 7 of 12]: [Training Set] TrainLossPerSample = 2.0991287; EvalErrPerSample = 0.69; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.080483 -Starting Epoch 8: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 13 retries for 100 elements (13.0%) to ensure window condition +RandomOrdering: recached sequence for seed 6: 15, 3, ... + Epoch[ 7 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.17570114; EvalErr[0]PerSample = 0.14000000; TotalTime = 0.0982s; SamplesPerSecond = 1018.8 +Finished Epoch[ 7 of 12]: [Training Set] TrainLossPerSample = 1.1757011; EvalErrPerSample = 0.14; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.100967 +Starting Epoch 8: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 7 at record count 700, and file position 700 already there from last epoch Starting minibatch loop. -randomordering: 22 retries for 100 elements (22.0%) to ensure window condition -randomordering: recached sequence for seed 7: 9, 19, ... - Epoch[ 8 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.01871979; EvalErr[0]PerSample = 0.61000000; TotalTime = 0.07961s; TotalTimePerSample = 0.79607ms; SamplesPerSecond = 1256 -Finished Epoch[ 8 of 12]: [Training Set] TrainLossPerSample = 2.0187197; EvalErrPerSample = 0.61000001; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.080087 -Starting Epoch 9: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 22 retries for 100 elements (22.0%) to ensure window condition +RandomOrdering: recached sequence for seed 7: 9, 19, ... 
+ Epoch[ 8 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.98662323; EvalErr[0]PerSample = 0.05000000; TotalTime = 0.0825s; SamplesPerSecond = 1212.2 +Finished Epoch[ 8 of 12]: [Training Set] TrainLossPerSample = 0.98662323; EvalErrPerSample = 0.049999997; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.08482 +Starting Epoch 9: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 8 at record count 800, and file position 800 already there from last epoch Starting minibatch loop. -randomordering: 16 retries for 100 elements (16.0%) to ensure window condition -randomordering: recached sequence for seed 8: 8, 5, ... - Epoch[ 9 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.75549896; EvalErr[0]PerSample = 0.35000000; TotalTime = 0.08258s; TotalTimePerSample = 0.82578ms; SamplesPerSecond = 1210 -Finished Epoch[ 9 of 12]: [Training Set] TrainLossPerSample = 1.7554989; EvalErrPerSample = 0.34999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.083038 -Starting Epoch 10: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 16 retries for 100 elements (16.0%) to ensure window condition +RandomOrdering: recached sequence for seed 8: 8, 5, ... + Epoch[ 9 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.72003891; EvalErr[0]PerSample = 0.01000000; TotalTime = 0.0983s; SamplesPerSecond = 1017.6 +Finished Epoch[ 9 of 12]: [Training Set] TrainLossPerSample = 0.72003889; EvalErrPerSample = 0.0099999998; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.101038 +Starting Epoch 10: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 9 at record count 900, and file position 900 already there from last epoch Starting minibatch loop. -randomordering: 16 retries for 100 elements (16.0%) to ensure window condition -randomordering: recached sequence for seed 9: 7, 10, ... - Epoch[10 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.64107086; EvalErr[0]PerSample = 0.39000000; TotalTime = 0.09024s; TotalTimePerSample = 0.90243ms; SamplesPerSecond = 1108 -Finished Epoch[10 of 12]: [Training Set] TrainLossPerSample = 1.6410708; EvalErrPerSample = 0.38999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.090849 -Starting Epoch 11: learning rate per sample = 0.005000 effective momentum = 0.700000 +RandomOrdering: 16 retries for 100 elements (16.0%) to ensure window condition +RandomOrdering: recached sequence for seed 9: 7, 10, ... + Epoch[10 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.60043060; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0839s; SamplesPerSecond = 1191.9 +Finished Epoch[10 of 12]: [Training Set] TrainLossPerSample = 0.60043061; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.086226 +Starting Epoch 11: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples starting epoch 10 at record count 1000, and file position 0 already there from last epoch Starting minibatch loop. -randomordering: 22 retries for 100 elements (22.0%) to ensure window condition -randomordering: recached sequence for seed 10: 13, 22, ... 
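(The "momentum as time constant" figure that the new log lines add is the momentum expressed as an exponential-decay time constant in samples: with minibatchSize = 10 and momentumPerMB = 0*10:0.7 from the config, epochs 1-10 report 0.0 samples and epochs 11-12 report -10/ln(0.7), which is about 28.0 samples. A small sketch of the conversion follows; the helper name is illustrative, not taken from the CNTK sources.

    #include <cmath>
    #include <cstddef>
    #include <cstdio>

    // Illustrative only: converts per-minibatch momentum to the
    // "momentum as time constant" value (in samples) shown in the log.
    // The per-sample momentum is momentumPerMB^(1/mbSize), and the time
    // constant tau solves perSampleMomentum == exp(-1/tau).
    double MomentumAsTimeConstant(double momentumPerMB, size_t mbSize)
    {
        if (momentumPerMB <= 0)
            return 0.0;                                  // epochs 1-10: 0.0 samples
        return -(double)mbSize / log(momentumPerMB);     // epochs 11-12: 28.0 samples
    }

    int main()
    {
        printf("%.1f\n", MomentumAsTimeConstant(0.0, 10)); // prints 0.0
        printf("%.1f\n", MomentumAsTimeConstant(0.7, 10)); // prints 28.0
    }
)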
- Epoch[11 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.30029129; EvalErr[0]PerSample = 0.12000000; TotalTime = 0.08305s; TotalTimePerSample = 0.83050ms; SamplesPerSecond = 1204 -Finished Epoch[11 of 12]: [Training Set] TrainLossPerSample = 1.3002913; EvalErrPerSample = 0.12; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.083644 -Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 +RandomOrdering: 22 retries for 100 elements (22.0%) to ensure window condition +RandomOrdering: recached sequence for seed 10: 13, 22, ... + Epoch[11 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.42560421; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0980s; SamplesPerSecond = 1020.2 +Finished Epoch[11 of 12]: [Training Set] TrainLossPerSample = 0.42560419; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.100689 +Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples starting epoch 11 at record count 1100, and file position 100 already there from last epoch Starting minibatch loop. -randomordering: 21 retries for 100 elements (21.0%) to ensure window condition -randomordering: recached sequence for seed 11: 6, 31, ... - Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.01696381; EvalErr[0]PerSample = 0.05000000; TotalTime = 0.08059s; TotalTimePerSample = 0.80586ms; SamplesPerSecond = 1240 -Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 1.0169638; EvalErrPerSample = 0.049999997; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.081069 -CNTKCommandTrainEnd: Train +RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition +RandomOrdering: recached sequence for seed 11: 6, 31, ... + Epoch[12 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.33292500; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0981s; SamplesPerSecond = 1019.0 +Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.33292499; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.101064 +CNTKCommandTrainEnd: train + +Post-processing network... + +3 roots: + ce = CrossEntropyWithSoftmax + outputNodes.z = Plus + err = ErrorPrediction +FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation +FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation +FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation -Allocating matrices for forward propagation. - - -Printing Gradient Computation Node Order ... 
- -CE[0, 0] = CrossEntropyWithSoftmax(labels[10, 0], OutputNodes.z[0, 0]) -OutputNodes.z[0, 0] = Plus(OutputNodes.t[0, 0], OutputNodes.b[10, 1]) -OutputNodes.b[10, 1] = LearnableParameter -OutputNodes.t[0, 0] = Times(OutputNodes.W[10, 128], h1.y[0, 0]) -h1.y[0, 0] = Sigmoid(h1.z[0, 0]) -h1.z[0, 0] = Plus(h1.t[0, 0], h1.b[128, 1]) -h1.b[128, 1] = LearnableParameter -h1.t[0, 0] = Times(h1.W[128, 512], pool2[0, 0]) -pool2[0, 0] = AveragePooling(conv2_act.act[0, 0]) -conv2_act.act[0, 0] = RectifiedLinear(conv2_act.convPlusB[0, 0]) -conv2_act.convPlusB[0, 0] = Plus(conv2_act.conv[0, 0], conv2_act.convB[32, 1]) -conv2_act.convB[32, 1] = LearnableParameter -conv2_act.conv[0, 0] = Convolution(conv2_act.convW[32, 400], pool1[0, 0]) -pool1[0, 0] = MaxPooling(conv1_act.act[0, 0]) -conv1_act.act[0, 0] = RectifiedLinear(conv1_act.convPlusB[0, 0]) -conv1_act.convPlusB[0, 0] = Plus(conv1_act.conv[0, 0], conv1_act.convB[16, 1]) -conv1_act.convB[16, 1] = LearnableParameter -conv1_act.conv[0, 0] = Convolution(conv1_act.convW[16, 25], featScaled[0, 0]) -featScaled[0, 0] = Scale(featScale[1, 1], features[784, 0]) -features[784, 0] = InputValue -featScale[1, 1] = LearnableParameter -conv1_act.convW[16, 25] = LearnableParameter -conv2_act.convW[32, 400] = LearnableParameter -h1.W[128, 512] = LearnableParameter -OutputNodes.W[10, 128] = LearnableParameter -labels[10, 0] = InputValue - -Validating for node CE. 26 nodes to process in pass 1. +Validating for node ce. 26 nodes to process in pass 1. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -955,15 +661,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE. 15 nodes to process in pass 2. +Validating for node ce. 15 nodes to process in pass 2. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -984,15 +690,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE, final verification. +Validating for node ce, final verification. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1013,19 +719,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Validating for node outputNodes.z. 24 nodes to process in pass 1. -Validating for node CE. 26 nodes to process in pass 1. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1046,15 +750,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE. 14 nodes to process in pass 2. +Validating for node outputNodes.z. 13 nodes to process in pass 2. -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1075,15 +777,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE, final verification. +Validating for node outputNodes.z, final verification. 
-Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1104,189 +804,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. - -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] 9 out of 24 nodes do not share the minibatch layout with the input data. - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -9 out of 24 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node err. 26 nodes to process in pass 1. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1307,15 +835,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node err. 14 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1336,15 +864,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node err, final verification. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1365,364 +893,268 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err. 
14 nodes to process in pass 2. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Post-processing network complete. evalNodeNames are not specified, using all the default evalnodes and training criterion nodes. + + +Allocating matrices for forward and/or backward propagation. starting epoch 0 at record count 0, and file position 0 already there from last epoch -randomordering: 11 retries for 100 elements (11.0%) to ensure window condition -randomordering: recached sequence for seed 0: 15, 33, ... -Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.87062637 Perplexity = 2.3884064 +RandomOrdering: 11 retries for 100 elements (11.0%) to ensure window condition +RandomOrdering: recached sequence for seed 0: 15, 33, ... 
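(The Final Results line that follows reports both criteria per sample. Perplexity is exp of the per-sample cross entropy -- exp(0.29111851) is about 1.3379231, and exp(0.87062637) about 2.3884064 for the pre-fix baseline earlier in this log -- while ErrorPrediction counts samples whose argmax over the 10 output rows disagrees with the label's. The sketch below mirrors those semantics only, assuming column-major minibatch storage; it is not CNTK's actual kernel code.

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdio>

    // Sketch only: per-sample metrics as reported in "Final Results".
    // 'labels' and 'z' are rows x cols matrices in column-major order.
    double ErrorPerSample(const float* labels, const float* z,
                          size_t rows, size_t cols)
    {
        size_t errors = 0;
        for (size_t j = 0; j < cols; j++)
        {
            const float* l = labels + j * rows;
            const float* o = z + j * rows;
            errors += (std::max_element(l, l + rows) - l)
                   != (std::max_element(o, o + rows) - o);
        }
        return (double)errors / cols;       // err: ErrorPrediction/Sample
    }

    double Perplexity(double cePerSample)   // ce: CrossEntropyWithSoftmax/Sample
    {
        return exp(cePerSample);
    }

    int main()
    {
        printf("%.7f\n", Perplexity(0.29111851)); // ~1.3379231, matching the log
    }
)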
+Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.29111851 Perplexity = 1.3379231 COMPLETED === Deleting last epoch data ==== Re-running from checkpoint ------------------------------------------------------------------- Build info: - Built time: Nov 23 2015 10:00:15 - Last modified date: Mon Nov 23 09:45:21 2015 - Built by alexeyk on alexey-rz - Build Path: C:\src\cntk\MachineLearning\CNTK\ - CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + Built time: Dec 18 2015 15:12:36 + Last modified date: Wed Dec 16 11:33:30 2015 + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + Build Branch: + Build SHA1: + Built by alexeyk on z840-01 + Build Path: C:\src\cntk\Source\CNTK\ ------------------------------------------------------------------- -running on alexey-rz at 2015/11/23 10:35:40 +running on z840-01 at 2015/12/18 15:13:59 command line: -C:\src\cntk\x64\Debug\CNTK.exe configFile=C:\src\cntk\Tests\Image\QuickE2E\cntk.config RunDir=C:\src\cntk\Tests\Image\_run DataDir=C:\src\cntk\Tests\Image\Data ConfigDir=C:\src\cntk\Tests\Image\QuickE2E DeviceId=0 +C:\src\cntk\x64\Debug\CNTK.exe configFile=QuickE2E\cntk.config ConfigDir=QuickE2E RunDir=_out DataDir=Data DeviceId=Auto stderr=_out\gpu.txt >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=$DeviceId$ -ndlMacros=$ConfigDir$/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=$RunDir$/models/cntk.dnn - deviceId=$DeviceId$ - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = $DeviceId$ +ndlMacros = "$ConfigDir$/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "$RunDir$/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=$RunDir$/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl +test = [ + action = "test" + modelPath = "$RunDir$/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -RunDir=C:\src\cntk\Tests\Image\_run -DataDir=C:\src\cntk\Tests\Image\Data -ConfigDir=C:\src\cntk\Tests\Image\QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=_out\gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG 
>>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
-precision=float
-command=Train:Test
-deviceId=0
-ndlMacros=C:\src\cntk\Tests\Image\QuickE2E/Macros.ndl
-parallelTrain=false
-Train=[
- action=train
- modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn
- deviceId=0
- traceLevel=1
- NDLNetworkBuilder=[
- networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl
- ]
- SGD=[
- epochSize=100
- minibatchSize=10
- learningRatesPerMB=0.05
- momentumPerMB=0*10:0.7
- maxEpochs=12
+precision = "float"
+command = train:test
+deviceId = Auto
+ndlMacros = "QuickE2E/Macros.ndl"
+parallelTrain = false
+numCPUThreads = 8
+train = [
+ action = "train"
+ modelPath = "_out/models/cntk.dnn"
+ traceLevel = 1
+ NDLNetworkBuilder = [
+ networkDescription = "QuickE2E/Convolution.ndl"
 ]
- reader=[
- readerType=UCIFastReader
- file=C:\src\cntk\Tests\Image\Data/Train.txt
- features=[
- dim=784
- start=1
+ SGD = [
+ epochSize = 100
+ minibatchSize = 10
+ learningRatesPerMB = 0.05
+ momentumPerMB = 0*10:0.7
+ maxEpochs = 12
+ ]
+ reader = [
+ readerType = "UCIFastReader"
+ file = "Data/Train.txt"
+ features = [
+ dim = 784
+ start = 1
 ]
- labels=[
- dim=1
- start=0
- labelDim=10
- labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt
+ labels = [
+ dim = 1
+ start = 0
+ labelDim = 10
+ labelMappingFile = "Data/labelsmap.txt"
 ]
 ]
]
-Test=[
- action=test
- modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn
- NDLNetworkBuilder=[
- networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl
+test = [
+ action = "test"
+ modelPath = "_out/models/cntk.dnn"
+ NDLNetworkBuilder = [
+ networkDescription = "QuickE2E/Convolution.ndl"
 ]
- reader=[
- readerType=UCIFastReader
- file=C:\src\cntk\Tests\Image\Data/Test.txt
- features=[
- dim=784
- start=1
+ reader = [
+ readerType = "UCIFastReader"
+ file = "Data/Test.txt"
+ features = [
+ dim = 784
+ start = 1
 ]
- labels=[
- dim=1
- start=0
- labelDim=10
- labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt
+ labels = [
+ dim = 1
+ start = 0
+ labelDim = 10
+ labelMappingFile = "Data/labelsmap.txt"
 ]
 ]
]
-RunDir=C:\src\cntk\Tests\Image\_run
-DataDir=C:\src\cntk\Tests\Image\Data
-ConfigDir=C:\src\cntk\Tests\Image\QuickE2E
-DeviceId=0
+ConfigDir=QuickE2E
+RunDir=_out
+DataDir=Data
+DeviceId=Auto
+stderr=_out\gpu.txt
<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
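In the UCIFastReader blocks above, start and dim are column offsets into each whitespace-separated row of the data file: the label sits in column 0 (start=0, dim=1, mapped onto labelDim=10 classes through labelsmap.txt) and the 784 pixel values occupy columns 1 through 784 (start=1, dim=784). A sketch of that row layout, inferred from the config (not the reader's actual implementation):

#include <sstream>
#include <string>
#include <vector>

// Split one UCI text row into a label (column 0) and 784 feature columns,
// mirroring labels(start=0, dim=1) and features(start=1, dim=784) above.
static bool ParseUCIRow(const std::string& line, int& label, std::vector<float>& features)
{
    std::istringstream in(line);
    if (!(in >> label))        // labels: start = 0, dim = 1
        return false;
    features.resize(784);      // features: start = 1, dim = 784
    for (auto& v : features)
        if (!(in >> v))
            return false;
    return true;
}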
>>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
-configparameters: cntk.config:command=Train:Test
-configparameters: cntk.config:ConfigDir=C:\src\cntk\Tests\Image\QuickE2E
-configparameters: cntk.config:DataDir=C:\src\cntk\Tests\Image\Data
-configparameters: cntk.config:deviceId=0
-configparameters: cntk.config:ndlMacros=C:\src\cntk\Tests\Image\QuickE2E/Macros.ndl
+configparameters: cntk.config:command=train:test
+configparameters: cntk.config:ConfigDir=QuickE2E
+configparameters: cntk.config:DataDir=Data
+configparameters: cntk.config:deviceId=Auto
+configparameters: cntk.config:ndlMacros=QuickE2E/Macros.ndl
+configparameters: cntk.config:numCPUThreads=8
 configparameters: cntk.config:parallelTrain=false
 configparameters: cntk.config:precision=float
-configparameters: cntk.config:RunDir=C:\src\cntk\Tests\Image\_run
-configparameters: cntk.config:Test=[
- action=test
- modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn
- NDLNetworkBuilder=[
- networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl
+configparameters: cntk.config:RunDir=_out
+configparameters: cntk.config:stderr=_out\gpu.txt
+configparameters: cntk.config:test=[
+ action = "test"
+ modelPath = "_out/models/cntk.dnn"
+ NDLNetworkBuilder = [
+ networkDescription = "QuickE2E/Convolution.ndl"
 ]
- reader=[
- readerType=UCIFastReader
- file=C:\src\cntk\Tests\Image\Data/Test.txt
- features=[
- dim=784
- start=1
+ reader = [
+ readerType = "UCIFastReader"
+ file = "Data/Test.txt"
+ features = [
+ dim = 784
+ start = 1
 ]
- labels=[
- dim=1
- start=0
- labelDim=10
- labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt
+ labels = [
+ dim = 1
+ start = 0
+ labelDim = 10
+ labelMappingFile = "Data/labelsmap.txt"
 ]
 ]
]
-configparameters: cntk.config:Train=[
- action=train
- modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn
- deviceId=0
- traceLevel=1
- NDLNetworkBuilder=[
- networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl
- ]
- SGD=[
- epochSize=100
- minibatchSize=10
- learningRatesPerMB=0.05
- momentumPerMB=0*10:0.7
- maxEpochs=12
+configparameters: cntk.config:train=[
+ action = "train"
+ modelPath = "_out/models/cntk.dnn"
+ traceLevel = 1
+ NDLNetworkBuilder = [
+ networkDescription = "QuickE2E/Convolution.ndl"
 ]
- reader=[
- readerType=UCIFastReader
- file=C:\src\cntk\Tests\Image\Data/Train.txt
- features=[
- dim=784
- start=1
+ SGD = [
+ epochSize = 100
+ minibatchSize = 10
+ learningRatesPerMB = 0.05
+ momentumPerMB = 0*10:0.7
+ maxEpochs = 12
+ ]
+ reader = [
+ readerType = "UCIFastReader"
+ file = "Data/Train.txt"
+ features = [
+ dim = 784
+ start = 1
 ]
- labels=[
- dim=1
- start=0
- labelDim=10
- labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt
+ labels = [
+ dim = 1
+ start = 0
+ labelDim = 10
+ labelMappingFile = "Data/labelsmap.txt"
 ]
 ]
]
<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
-command: Train Test 
+command: train test 
 precision = float
-CNTKModelPath: C:\src\cntk\Tests\Image\_run/models/cntk.dnn
-CNTKCommandTrainInfo: Train : 12
+Using 8 CPU threads
+CNTKModelPath: _out/models/cntk.dnn
+CNTKCommandTrainInfo: train : 12
 CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 12
-CNTKCommandTrainBegin: Train
+CNTKCommandTrainBegin: train
+LockDevice: Locked GPU 0 to test availability.
+LockDevice: Unlocked GPU 0 after testing.
+LockDevice: Locked GPU 1 to test availability.
+LockDevice: Unlocked GPU 1 after testing.
+LockDevice: Locked GPU 2 to test availability.
+LockDevice: Unlocked GPU 2 after testing.
+LockDevice: Locked GPU 0 for exclusive use.
 NDLBuilder Using GPU 0
-reading uci file C:\src\cntk\Tests\Image\Data/Train.txt
-Starting from checkpoint. Load Network From File C:\src\cntk\Tests\Image\_run/models/cntk.dnn.11.
+Reading UCI file Data/Train.txt
+Starting from checkpoint. Load Network From File _out/models/cntk.dnn.11.
+
+Post-processing network...
+
+3 roots:
+ ce = CrossEntropyWithSoftmax
+ outputNodes.z = Plus
+ err = ErrorPrediction
+FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation
+FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation
+FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation
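The schedule syntax momentumPerMB=0*10:0.7 in the config above means: use 0 for the first 10 epochs, then 0.7 for all remaining epochs; learningRatesPerMB uses the same value*count:... grammar. A sketch of that lookup (a hypothetical parser for illustration only; the real SGD code also supports minibatch-size-dependent variants of this syntax):

#include <cstdint>
#include <cstdlib>
#include <string>
#include <vector>

struct ScheduleEntry { double value; size_t count; };

// Parse "0*10:0.7" into { {0, 10}, {0.7, unbounded} }.
static std::vector<ScheduleEntry> ParseSchedule(const std::string& s)
{
    std::vector<ScheduleEntry> entries;
    size_t pos = 0;
    while (pos < s.size())
    {
        size_t colon = s.find(':', pos);
        std::string item = s.substr(pos, colon == std::string::npos ? std::string::npos : colon - pos);
        size_t star = item.find('*');
        ScheduleEntry e;
        e.value = std::atof(item.substr(0, star).c_str());
        e.count = star == std::string::npos ? SIZE_MAX : (size_t)std::atoi(item.substr(star + 1).c_str());
        entries.push_back(e);
        if (colon == std::string::npos)
            break;
        pos = colon + 1;
    }
    return entries;
}

// Value for a 0-based epoch index; the last entry repeats forever.
static double ScheduleAt(const std::vector<ScheduleEntry>& entries, size_t epoch)
{
    for (const auto& e : entries)
    {
        if (epoch < e.count)
            return e.value;
        epoch -= e.count;
    }
    return entries.back().value;
}

With this, ScheduleAt(ParseSchedule("0*10:0.7"), 11) yields 0.7, which is why the 12th epoch below runs with effective momentum 0.7.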
-Allocating matrices for forward propagation.
-
-
-Printing Gradient Computation Node Order ... 
-
-CE[0, 0] = CrossEntropyWithSoftmax(labels[10, 0], OutputNodes.z[0, 0])
-OutputNodes.z[0, 0] = Plus(OutputNodes.t[0, 0], OutputNodes.b[10, 1])
-OutputNodes.b[10, 1] = LearnableParameter
-OutputNodes.t[0, 0] = Times(OutputNodes.W[10, 128], h1.y[0, 0])
-h1.y[0, 0] = Sigmoid(h1.z[0, 0])
-h1.z[0, 0] = Plus(h1.t[0, 0], h1.b[128, 1])
-h1.b[128, 1] = LearnableParameter
-h1.t[0, 0] = Times(h1.W[128, 512], pool2[0, 0])
-pool2[0, 0] = AveragePooling(conv2_act.act[0, 0])
-conv2_act.act[0, 0] = RectifiedLinear(conv2_act.convPlusB[0, 0])
-conv2_act.convPlusB[0, 0] = Plus(conv2_act.conv[0, 0], conv2_act.convB[32, 1])
-conv2_act.convB[32, 1] = LearnableParameter
-conv2_act.conv[0, 0] = Convolution(conv2_act.convW[32, 400], pool1[0, 0])
-pool1[0, 0] = MaxPooling(conv1_act.act[0, 0])
-conv1_act.act[0, 0] = RectifiedLinear(conv1_act.convPlusB[0, 0])
-conv1_act.convPlusB[0, 0] = Plus(conv1_act.conv[0, 0], conv1_act.convB[16, 1])
-conv1_act.convB[16, 1] = LearnableParameter
-conv1_act.conv[0, 0] = Convolution(conv1_act.convW[16, 25], featScaled[0, 0])
-featScaled[0, 0] = Scale(featScale[1, 1], features[784, 0])
-features[784, 0] = InputValue
-featScale[1, 1] = LearnableParameter
-conv1_act.convW[16, 25] = LearnableParameter
-conv2_act.convW[32, 400] = LearnableParameter
-h1.W[128, 512] = LearnableParameter
-OutputNodes.W[10, 128] = LearnableParameter
-labels[10, 0] = InputValue
-
-Validating for node CE. 26 nodes to process in pass 1.
+Validating for node ce. 26 nodes to process in pass 1.

 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -1743,15 +1175,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node CE. 15 nodes to process in pass 2.
+Validating for node ce. 15 nodes to process in pass 2.

 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -1772,15 +1204,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node CE, final verification.
+Validating for node ce, final verification.

 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -1801,19 +1233,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

 10 out of 26 nodes do not share the minibatch layout with the input data.

+Validating for node outputNodes.z. 24 nodes to process in pass 1.
-
-Validating for node CE. 26 nodes to process in pass 1.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -1834,15 +1264,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]

-Validating for node CE. 14 nodes to process in pass 2.
+Validating for node outputNodes.z. 13 nodes to process in pass 2.

-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -1863,15 +1291,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]

-Validating for node CE, final verification.
+Validating for node outputNodes.z, final verification.

-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -1892,189 +1318,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-10 out of 26 nodes do not share the minibatch layout with the input data.
-
-
-
-Validating for node OutputNodes.z. 24 nodes to process in pass 1.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
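The shape annotations in these validation lines all follow from the convolution geometry: unpadded 5x5 kernels take the 28x28 input to 24x24 (x16 maps = 9216 values), 2x2 pooling halves that to 12x12, the second 5x5 convolution over 16 input maps (kernel size 400 = 5*5*16) yields 8x8x32 = 2048, and the final pooling gives 4x4x32 = 512, the input to h1.t. A quick check of that arithmetic (kernel and pooling sizes are inferred from the dims above, not stated in this log):

#include <cstdio>

// Output width of an unpadded ("valid") convolution and of non-overlapping pooling.
static int ConvOut(int in, int kernel) { return in - kernel + 1; }
static int PoolOut(int in, int window) { return in / window; }

int main()
{
    int w = 28;            // features: 28 x 28 x 1 = 784
    w = ConvOut(w, 5);     // conv1: 24 x 24 x 16 = 9216
    std::printf("conv1: %dx%dx16 = %d\n", w, w, w * w * 16);
    w = PoolOut(w, 2);     // pool1: 12 x 12 x 16 = 2304
    std::printf("pool1: %dx%dx16 = %d\n", w, w, w * w * 16);
    w = ConvOut(w, 5);     // conv2: 8 x 8 x 32 = 2048
    std::printf("conv2: %dx%dx32 = %d\n", w, w, w * w * 32);
    w = PoolOut(w, 2);     // pool2: 4 x 4 x 32 = 512
    std::printf("pool2: %dx%dx32 = %d\n", w, w, w * w * 32);
    return 0;
}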
-
-Validating for node OutputNodes.z. 13 nodes to process in pass 2.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-Validating for node OutputNodes.z, final verification.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]

 9 out of 24 nodes do not share the minibatch layout with the input data.

-
-Validating for node OutputNodes.z. 24 nodes to process in pass 1.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-Validating for node OutputNodes.z. 13 nodes to process in pass 2.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-Validating for node OutputNodes.z, final verification.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-9 out of 24 nodes do not share the minibatch layout with the input data.
-
-
-
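The err node validated next implements the classification error rate: for each minibatch column, the argmax of the network output is compared against the argmax of the one-hot label, and mismatches are counted (ErrorPrediction/Sample = 0 at the top of this baseline therefore means all 100 test samples were classified correctly). A sketch of that criterion (illustrative only, not the node's actual code):

#include <algorithm>
#include <cstddef>
#include <vector>

// Fraction of samples whose predicted class (argmax of z) differs from the
// labeled class (argmax of the one-hot label column) -- the ErrorPrediction idea.
static double ErrorRate(const std::vector<std::vector<float>>& labels,
                        const std::vector<std::vector<float>>& z)
{
    std::size_t errors = 0;
    for (std::size_t i = 0; i < z.size(); i++)
    {
        auto pred  = std::max_element(z[i].begin(), z[i].end()) - z[i].begin();
        auto truth = std::max_element(labels[i].begin(), labels[i].end()) - labels[i].begin();
        if (pred != truth)
            errors++;
    }
    return z.empty() ? 0.0 : (double)errors / z.size();
}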
-Validating for node Err. 26 nodes to process in pass 1.
+Validating for node err. 26 nodes to process in pass 1.

 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2095,15 +1349,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node Err. 14 nodes to process in pass 2.
+Validating for node err. 14 nodes to process in pass 2.

 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2124,15 +1378,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node Err, final verification.
+Validating for node err, final verification.

 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2153,114 +1407,29 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

 10 out of 26 nodes do not share the minibatch layout with the input data.

-
-
-Validating for node Err. 26 nodes to process in pass 1.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-Validating for node Err. 14 nodes to process in pass 2.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-Validating for node Err, final verification.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-10 out of 26 nodes do not share the minibatch layout with the input data.
+Post-processing network complete.

 SGD using GPU 0.
-GetTrainCriterionNodes ...
-GetEvalCriterionNodes ...
+
+Training criterion node(s):
+
+ ce = CrossEntropyWithSoftmax
+
+Evaluation criterion node(s):
+
+ err = ErrorPrediction

-Allocating matrices for gradient computing
+Allocating matrices for forward and/or backward propagation.
 No PreCompute nodes found, skipping PreCompute step
 Warning: checkpoint file is missing. learning parameters will be initialized from 0
 Set Max Temp Mem Size For Convolution Nodes to 0 samples.
-Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 
+Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples
 starting at epoch 11 counting lines to determine record count
 1000 records found
@@ -2268,49 +1437,27 @@ starting epoch 11 at record count 1100, and file position 100
 reading from record 0 to 100 to be positioned properly for epoch
 Starting minibatch loop.
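The new baseline's Starting Epoch line above reports the momentum both per minibatch and as a time constant. Reading momentum smoothing as a first-order low-pass filter, a per-minibatch momentum mu applied once every m samples corresponds to a time constant of T = -m / ln(mu) samples; with minibatchSize = 10 and momentumPerMB = 0.7 that is -10 / ln(0.7) = 10 / 0.3567, approximately 28.0 samples, matching the printed value. (This reading is inferred from the numbers in the log, not stated in the patch.)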
-randomordering: 21 retries for 100 elements (21.0%) to ensure window condition
-randomordering: recached sequence for seed 11: 6, 31, ...
- Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.03456436; EvalErr[0]PerSample = 0.02000000; TotalTime = 0.19100s; TotalTimePerSample = 1.90999ms; SamplesPerSecond = 523
-Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 1.0345644; EvalErrPerSample = 0.02; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.223405
-CNTKCommandTrainEnd: Train
+RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 11: 6, 31, ...
+ Epoch[12 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.33976147; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.2483s; SamplesPerSecond = 402.8
+Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.33976147; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.283086
+CNTKCommandTrainEnd: train
+
+Post-processing network...
+
+3 roots:
+ ce = CrossEntropyWithSoftmax
+ outputNodes.z = Plus
+ err = ErrorPrediction
+FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation
+FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation
+FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation

-Allocating matrices for forward propagation.
-
-
-Printing Gradient Computation Node Order ... 
-
-CE[0, 0] = CrossEntropyWithSoftmax(labels[10, 0], OutputNodes.z[0, 0])
-OutputNodes.z[0, 0] = Plus(OutputNodes.t[0, 0], OutputNodes.b[10, 1])
-OutputNodes.b[10, 1] = LearnableParameter
-OutputNodes.t[0, 0] = Times(OutputNodes.W[10, 128], h1.y[0, 0])
-h1.y[0, 0] = Sigmoid(h1.z[0, 0])
-h1.z[0, 0] = Plus(h1.t[0, 0], h1.b[128, 1])
-h1.b[128, 1] = LearnableParameter
-h1.t[0, 0] = Times(h1.W[128, 512], pool2[0, 0])
-pool2[0, 0] = AveragePooling(conv2_act.act[0, 0])
-conv2_act.act[0, 0] = RectifiedLinear(conv2_act.convPlusB[0, 0])
-conv2_act.convPlusB[0, 0] = Plus(conv2_act.conv[0, 0], conv2_act.convB[32, 1])
-conv2_act.convB[32, 1] = LearnableParameter
-conv2_act.conv[0, 0] = Convolution(conv2_act.convW[32, 400], pool1[0, 0])
-pool1[0, 0] = MaxPooling(conv1_act.act[0, 0])
-conv1_act.act[0, 0] = RectifiedLinear(conv1_act.convPlusB[0, 0])
-conv1_act.convPlusB[0, 0] = Plus(conv1_act.conv[0, 0], conv1_act.convB[16, 1])
-conv1_act.convB[16, 1] = LearnableParameter
-conv1_act.conv[0, 0] = Convolution(conv1_act.convW[16, 25], featScaled[0, 0])
-featScaled[0, 0] = Scale(featScale[1, 1], features[784, 0])
-features[784, 0] = InputValue
-featScale[1, 1] = LearnableParameter
-conv1_act.convW[16, 25] = LearnableParameter
-conv2_act.convW[32, 400] = LearnableParameter
-h1.W[128, 512] = LearnableParameter
-OutputNodes.W[10, 128] = LearnableParameter
-labels[10, 0] = InputValue
-
-Validating for node CE. 26 nodes to process in pass 1.
+Validating for node ce. 26 nodes to process in pass 1.

 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2331,15 +1478,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node CE. 15 nodes to process in pass 2.
+Validating for node ce. 15 nodes to process in pass 2.

 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2360,15 +1507,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node CE, final verification.
+Validating for node ce, final verification.

 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2389,19 +1536,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

 10 out of 26 nodes do not share the minibatch layout with the input data.

+Validating for node outputNodes.z. 24 nodes to process in pass 1.
-
-Validating for node CE. 26 nodes to process in pass 1.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2422,15 +1567,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]

-Validating for node CE. 14 nodes to process in pass 2.
+Validating for node outputNodes.z. 13 nodes to process in pass 2.

-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2451,15 +1594,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]

-Validating for node CE, final verification.
+Validating for node outputNodes.z, final verification.

-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2480,189 +1621,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-10 out of 26 nodes do not share the minibatch layout with the input data.
-
-
-
-Validating for node OutputNodes.z. 24 nodes to process in pass 1.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-Validating for node OutputNodes.z. 13 nodes to process in pass 2.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-Validating for node OutputNodes.z, final verification.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]

 9 out of 24 nodes do not share the minibatch layout with the input data.

-
-Validating for node OutputNodes.z. 24 nodes to process in pass 1.
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -9 out of 24 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node err. 26 nodes to process in pass 1. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2683,15 +1652,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node err. 14 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2712,15 +1681,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node err, final verification. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2741,108 +1710,21 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err. 
14 nodes to process in pass 2. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Post-processing network complete. evalNodeNames are not specified, using all the default evalnodes and training criterion nodes. + + +Allocating matrices for forward and/or backward propagation. starting epoch 0 at record count 0, and file position 0 already there from last epoch -randomordering: 11 retries for 100 elements (11.0%) to ensure window condition -randomordering: recached sequence for seed 0: 15, 33, ... -Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.90504265 Perplexity = 2.4720373 +RandomOrdering: 11 retries for 100 elements (11.0%) to ensure window condition +RandomOrdering: recached sequence for seed 0: 15, 33, ... 
+Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.30440025 Perplexity = 1.3558116 COMPLETED diff --git a/Tests/EndToEndTests/Image/QuickE2E/baseline.windows.release.gpu.txt b/Tests/EndToEndTests/Image/QuickE2E/baseline.windows.release.gpu.txt index 81ab0c056..c34022c07 100644 --- a/Tests/EndToEndTests/Image/QuickE2E/baseline.windows.release.gpu.txt +++ b/Tests/EndToEndTests/Image/QuickE2E/baseline.windows.release.gpu.txt @@ -1,224 +1,245 @@ ------------------------------------------------------------------- Build info: - Built time: Nov 23 2015 09:55:26 - Last modified date: Mon Nov 23 09:45:21 2015 - Built by alexeyk on alexey-rz - Build Path: C:\src\cntk\MachineLearning\CNTK\ + Built time: Dec 18 2015 14:55:05 + Last modified date: Wed Dec 16 11:33:30 2015 CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + Build Branch: + Build SHA1: + Built by alexeyk on z840-01 + Build Path: C:\src\cntk\Source\CNTK\ ------------------------------------------------------------------- -running on alexey-rz at 2015/11/23 10:09:12 +running on z840-01 at 2015/12/18 14:58:21 command line: -C:\src\cntk\x64\Release\CNTK.exe configFile=C:\src\cntk\Tests\Image\QuickE2E\cntk.config RunDir=C:\src\cntk\Tests\Image\_run DataDir=C:\src\cntk\Tests\Image\Data ConfigDir=C:\src\cntk\Tests\Image\QuickE2E DeviceId=0 +C:\src\cntk\x64\Release\CNTK.exe configFile=QuickE2E\cntk.config ConfigDir=QuickE2E RunDir=_out DataDir=Data DeviceId=Auto stderr=_out\gpu.txt >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=$DeviceId$ -ndlMacros=$ConfigDir$/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=$RunDir$/models/cntk.dnn - deviceId=$DeviceId$ - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = $DeviceId$ +ndlMacros = "$ConfigDir$/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "$RunDir$/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=$RunDir$/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl +test = [ + action = "test" + modelPath = "$RunDir$/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] 
-RunDir=C:\src\cntk\Tests\Image\_run -DataDir=C:\src\cntk\Tests\Image\Data -ConfigDir=C:\src\cntk\Tests\Image\QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=_out\gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=0 -ndlMacros=C:\src\cntk\Tests\Image\QuickE2E/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = Auto +ndlMacros = "QuickE2E/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl +test = [ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -RunDir=C:\src\cntk\Tests\Image\_run -DataDir=C:\src\cntk\Tests\Image\Data -ConfigDir=C:\src\cntk\Tests\Image\QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=_out\gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -configparameters: cntk.config:command=Train:Test -configparameters: cntk.config:ConfigDir=C:\src\cntk\Tests\Image\QuickE2E -configparameters: cntk.config:DataDir=C:\src\cntk\Tests\Image\Data -configparameters: cntk.config:deviceId=0 -configparameters: cntk.config:ndlMacros=C:\src\cntk\Tests\Image\QuickE2E/Macros.ndl +configparameters: cntk.config:command=train:test +configparameters: cntk.config:ConfigDir=QuickE2E +configparameters: cntk.config:DataDir=Data +configparameters: cntk.config:deviceId=Auto +configparameters: cntk.config:ndlMacros=QuickE2E/Macros.ndl +configparameters: cntk.config:numCPUThreads=8 configparameters: cntk.config:parallelTrain=false configparameters: cntk.config:precision=float -configparameters: 
cntk.config:RunDir=C:\src\cntk\Tests\Image\_run -configparameters: cntk.config:Test=[ - action=test - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl +configparameters: cntk.config:RunDir=_out +configparameters: cntk.config:stderr=_out\gpu.txt +configparameters: cntk.config:test=[ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -configparameters: cntk.config:Train=[ - action=train - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +configparameters: cntk.config:train=[ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< -command: Train Test +command: train test precision = float -CNTKModelPath: C:\src\cntk\Tests\Image\_run/models/cntk.dnn -CNTKCommandTrainInfo: Train : 12 +Using 8 CPU threads +CNTKModelPath: _out/models/cntk.dnn +CNTKCommandTrainInfo: train : 12 CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 12 -CNTKCommandTrainBegin: Train +CNTKCommandTrainBegin: train +LockDevice: Locked GPU 0 to test availability. +LockDevice: Unlocked GPU 0 after testing. +LockDevice: Locked GPU 1 to test availability. +LockDevice: Unlocked GPU 1 after testing. +LockDevice: Locked GPU 2 to test availability. +LockDevice: Unlocked GPU 2 after testing. +LockDevice: Locked GPU 0 for exclusive use. NDLBuilder Using GPU 0 -reading uci file C:\src\cntk\Tests\Image\Data/Train.txt +Reading UCI file Data/Train.txt +Microsoft::MSR::CNTK::GPUMatrix::SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4 + +Post-processing network... + +3 roots: + err = ErrorPrediction + outputNodes.z = Plus + ce = CrossEntropyWithSoftmax +FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation +FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation +FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation -Allocating matrices for forward propagation. - - -Validating for node CE. 26 nodes to process in pass 1. +Validating for node err. 
26 nodes to process in pass 1. Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -227,27 +248,27 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = 
Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE. 15 nodes to process in pass 2. +Validating for node err. 15 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -256,27 +277,27 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = 
LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE, final verification. +Validating for node err, final verification. Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -285,31 +306,29 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], 
pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Validating for node outputNodes.z. 24 nodes to process in pass 1. -Validating for node CE. 26 nodes to process in pass 1. - -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -318,27 +337,25 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating 
--> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE. 14 nodes to process in pass 2. +Validating for node outputNodes.z. 13 nodes to process in pass 2. -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -347,27 +364,25 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = 
Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE, final verification. +Validating for node outputNodes.z, final verification. 
-Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -376,201 +391,29 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] 9 out of 24 nodes do not share the minibatch layout with the input data. - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] - -9 out of 24 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node ce. 26 nodes to process in pass 1. 
Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -579,27 +422,27 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, 
MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node ce. 14 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -608,27 +451,27 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> 
OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node ce, final verification. Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -637,126 +480,40 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, 
MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Post-processing network complete. - -Validating for node Err. 26 nodes to process in pass 1. - -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = 
ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -Validating for node Err. 14 nodes to process in pass 2. - -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -Validating for node Err, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - -SetUniformRandomValue (GPU): creating curand object with seed 1 SGD using GPU 0. -GetTrainCriterionNodes ... -GetEvalCriterionNodes ... + +Training criterion node(s): + ce = CrossEntropyWithSoftmax + +Evaluation criterion node(s): + err = ErrorPrediction -Allocating matrices for gradient computing +Allocating matrices for forward and/or backward propagation. No PreCompute nodes found, skipping PreCompute step Set Max Temp Mem Size For Convolution Nodes to 0 samples. -Starting Epoch 1: learning rate per sample = 0.005000 effective momentum = 0.000000 +Starting Epoch 1: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting at epoch 0 counting lines to determine record count 1000 records found @@ -764,119 +521,125 @@ starting epoch 0 at record count 0, and file position 0 already there from last epoch Starting minibatch loop. -randomordering: 11 retries for 100 elements (11.0%) to ensure window condition -randomordering: recached sequence for seed 0: 15, 33, ... 
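
A note for readers decoding the [rows {W=.., H=.., C=..}, MBSize n] annotations in the validation listings above: every row count follows from the image geometry. Assuming 5x5 kernels and 2x2 non-overlapping pooling, which is what the weight shapes [16, 25] (16 maps x 5*5*1 weights) and [32, 400] (32 maps x 5*5*16 weights) suggest, the sizes check out; a compile-time C++ sanity check:

    // Assumption: 5x5 kernels and 2x2 pooling, inferred from the weight
    // shapes above, not read out of the network configuration.
    static_assert((28 - 5 + 1) * (28 - 5 + 1) * 16 == 9216, "conv1 -> 24 x 24 x 16");
    static_assert((24 / 2) * (24 / 2) * 16 == 2304,         "pool1 -> 12 x 12 x 16");
    static_assert((12 - 5 + 1) * (12 - 5 + 1) * 32 == 2048, "conv2 -> 8 x 8 x 32");
    static_assert((8 / 2) * (8 / 2) * 32 == 512,            "pool2 -> 4 x 4 x 32");

The final [512] vector is what feeds the h1.t = Times(h1.W[128, 512], pool2[512 ...]) line.
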
- Epoch[ 1 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.37891785; EvalErr[0]PerSample = 0.93000000; TotalTime = 0.88819s; TotalTimePerSample = 8.88193ms; SamplesPerSecond = 112 -Finished Epoch[ 1 of 12]: [Training Set] TrainLossPerSample = 2.3789177; EvalErrPerSample = 0.93000001; AvgLearningRatePerSample = 0.004999999888; EpochTime=1.054592 -Starting Epoch 2: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 11 retries for 100 elements (11.0%) to ensure window condition +RandomOrdering: recached sequence for seed 0: 15, 33, ... + Epoch[ 1 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.34708694; EvalErr[0]PerSample = 0.92000000; TotalTime = 0.4657s; SamplesPerSecond = 214.7 +Finished Epoch[ 1 of 12]: [Training Set] TrainLossPerSample = 2.3470869; EvalErrPerSample = 0.91999996; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.484419 +Starting Epoch 2: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 1 at record count 100, and file position 100 already there from last epoch Starting minibatch loop. -randomordering: 26 retries for 100 elements (26.0%) to ensure window condition -randomordering: recached sequence for seed 1: 20, 26, ... - Epoch[ 2 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.37942505; EvalErr[0]PerSample = 0.91000000; TotalTime = 0.03505s; TotalTimePerSample = 0.35045ms; SamplesPerSecond = 2853 -Finished Epoch[ 2 of 12]: [Training Set] TrainLossPerSample = 2.379425; EvalErrPerSample = 0.90999997; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.035368 -Starting Epoch 3: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 26 retries for 100 elements (26.0%) to ensure window condition +RandomOrdering: recached sequence for seed 1: 20, 26, ... + Epoch[ 2 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.29444092; EvalErr[0]PerSample = 0.85000000; TotalTime = 0.0227s; SamplesPerSecond = 4400.1 +Finished Epoch[ 2 of 12]: [Training Set] TrainLossPerSample = 2.294441; EvalErrPerSample = 0.84999996; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.024503 +Starting Epoch 3: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 2 at record count 200, and file position 200 already there from last epoch Starting minibatch loop. -randomordering: 28 retries for 100 elements (28.0%) to ensure window condition -randomordering: recached sequence for seed 2: 4, 35, ... - Epoch[ 3 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.32070969; EvalErr[0]PerSample = 0.85000000; TotalTime = 0.03474s; TotalTimePerSample = 0.34742ms; SamplesPerSecond = 2878 -Finished Epoch[ 3 of 12]: [Training Set] TrainLossPerSample = 2.3207097; EvalErrPerSample = 0.84999996; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.035036 -Starting Epoch 4: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 28 retries for 100 elements (28.0%) to ensure window condition +RandomOrdering: recached sequence for seed 2: 4, 35, ... 
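
The RandomOrdering lines above report how many candidate swaps the randomizer had to re-draw so that no sample ends up outside a fixed window of its original position. The sketch below shows one way such a constrained shuffle can be implemented; the acceptance rule and the window parameter are illustrative assumptions, not CNTK's actual randomizer:

    #include <random>
    #include <utility>
    #include <vector>

    // Shuffle 0..n-1 such that every element stays within 'window' slots of
    // its original position; rejected swaps are counted, like the log's
    // "N retries for M elements" figure.
    std::vector<size_t> WindowedShuffle(size_t n, size_t window, unsigned seed, size_t& retries)
    {
        std::vector<size_t> order(n);
        for (size_t i = 0; i < n; i++)
            order[i] = i;
        std::mt19937 rng(seed);
        std::uniform_int_distribution<size_t> pick(0, n - 1);
        auto inWindow = [&](size_t value, size_t slot)
        { return (value > slot ? value - slot : slot - value) <= window; };
        retries = 0;
        for (size_t i = 0; i < n; i++)
        {
            size_t j = pick(rng);
            while (!inWindow(order[j], i) || !inWindow(order[i], j))
            {
                retries++; // swap would violate the window condition; re-draw
                j = pick(rng);
            }
            std::swap(order[i], order[j]);
        }
        return order;
    }

For the 100-element epochs in this log, the reported rejection rate stays in the 11 to 28 percent range.
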
+ Epoch[ 3 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.13786591; EvalErr[0]PerSample = 0.73000000; TotalTime = 0.0224s; SamplesPerSecond = 4464.7 +Finished Epoch[ 3 of 12]: [Training Set] TrainLossPerSample = 2.1378658; EvalErrPerSample = 0.72999996; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.024125 +Starting Epoch 4: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 3 at record count 300, and file position 300 already there from last epoch Starting minibatch loop. -randomordering: 17 retries for 100 elements (17.0%) to ensure window condition -randomordering: recached sequence for seed 3: 28, 7, ... - Epoch[ 4 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.36838959; EvalErr[0]PerSample = 0.90000000; TotalTime = 0.03532s; TotalTimePerSample = 0.35322ms; SamplesPerSecond = 2831 -Finished Epoch[ 4 of 12]: [Training Set] TrainLossPerSample = 2.3683896; EvalErrPerSample = 0.89999998; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.03561 -Starting Epoch 5: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 17 retries for 100 elements (17.0%) to ensure window condition +RandomOrdering: recached sequence for seed 3: 28, 7, ... + Epoch[ 4 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.03929321; EvalErr[0]PerSample = 0.75000000; TotalTime = 0.0230s; SamplesPerSecond = 4355.8 +Finished Epoch[ 4 of 12]: [Training Set] TrainLossPerSample = 2.0392931; EvalErrPerSample = 0.75; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.024759 +Starting Epoch 5: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 4 at record count 400, and file position 400 already there from last epoch Starting minibatch loop. -randomordering: 15 retries for 100 elements (15.0%) to ensure window condition -randomordering: recached sequence for seed 4: 5, 36, ... - Epoch[ 5 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.32354156; EvalErr[0]PerSample = 0.84000000; TotalTime = 0.03528s; TotalTimePerSample = 0.35281ms; SamplesPerSecond = 2834 -Finished Epoch[ 5 of 12]: [Training Set] TrainLossPerSample = 2.3235414; EvalErrPerSample = 0.83999997; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.035547 -Starting Epoch 6: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 15 retries for 100 elements (15.0%) to ensure window condition +RandomOrdering: recached sequence for seed 4: 5, 36, ... + Epoch[ 5 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.77985336; EvalErr[0]PerSample = 0.53000000; TotalTime = 0.0193s; SamplesPerSecond = 5174.4 +Finished Epoch[ 5 of 12]: [Training Set] TrainLossPerSample = 1.7798533; EvalErrPerSample = 0.52999997; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.020804 +Starting Epoch 6: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 5 at record count 500, and file position 500 already there from last epoch Starting minibatch loop. -randomordering: 13 retries for 100 elements (13.0%) to ensure window condition -randomordering: recached sequence for seed 5: 11, 48, ... 
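
One readability change in the per-minibatch progress lines: the old TotalTimePerSample column is gone, and the remaining SamplesPerSecond figure is simply SamplesSeen / TotalTime. Checking epoch 1 above:

    #include <cstdio>

    int main()
    {
        // Epoch 1 of the new log: SamplesSeen = 100, TotalTime = 0.4657s
        std::printf("SamplesPerSecond = %.1f\n", 100 / 0.4657); // prints 214.7
    }
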
- Epoch[ 6 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.24672409; EvalErr[0]PerSample = 0.83000000; TotalTime = 0.03495s; TotalTimePerSample = 0.34947ms; SamplesPerSecond = 2861 -Finished Epoch[ 6 of 12]: [Training Set] TrainLossPerSample = 2.2467241; EvalErrPerSample = 0.82999998; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.035271 -Starting Epoch 7: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 13 retries for 100 elements (13.0%) to ensure window condition +RandomOrdering: recached sequence for seed 5: 11, 48, ... + Epoch[ 6 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.49362656; EvalErr[0]PerSample = 0.14000000; TotalTime = 0.0194s; SamplesPerSecond = 5161.0 +Finished Epoch[ 6 of 12]: [Training Set] TrainLossPerSample = 1.4936265; EvalErrPerSample = 0.14; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.020921 +Starting Epoch 7: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 6 at record count 600, and file position 600 already there from last epoch Starting minibatch loop. -randomordering: 13 retries for 100 elements (13.0%) to ensure window condition -randomordering: recached sequence for seed 6: 15, 3, ... - Epoch[ 7 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.09912888; EvalErr[0]PerSample = 0.69000000; TotalTime = 0.03487s; TotalTimePerSample = 0.34871ms; SamplesPerSecond = 2867 -Finished Epoch[ 7 of 12]: [Training Set] TrainLossPerSample = 2.0991287; EvalErrPerSample = 0.69; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.035159 -Starting Epoch 8: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 13 retries for 100 elements (13.0%) to ensure window condition +RandomOrdering: recached sequence for seed 6: 15, 3, ... + Epoch[ 7 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.17570114; EvalErr[0]PerSample = 0.14000000; TotalTime = 0.0207s; SamplesPerSecond = 4830.0 +Finished Epoch[ 7 of 12]: [Training Set] TrainLossPerSample = 1.1757011; EvalErrPerSample = 0.14; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.02243 +Starting Epoch 8: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 7 at record count 700, and file position 700 already there from last epoch Starting minibatch loop. -randomordering: 22 retries for 100 elements (22.0%) to ensure window condition -randomordering: recached sequence for seed 7: 9, 19, ... - Epoch[ 8 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.01871979; EvalErr[0]PerSample = 0.61000000; TotalTime = 0.03490s; TotalTimePerSample = 0.34905ms; SamplesPerSecond = 2864 -Finished Epoch[ 8 of 12]: [Training Set] TrainLossPerSample = 2.0187197; EvalErrPerSample = 0.61000001; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.035189 -Starting Epoch 9: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 22 retries for 100 elements (22.0%) to ensure window condition +RandomOrdering: recached sequence for seed 7: 9, 19, ... 
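
The "starting epoch N at record count C, and file position P" bookkeeping is also easy to verify: with epochSize = 100 and the "1000 records found" at startup, the file position is the record count modulo the corpus size, which is why epoch 10 wraps back to file position 0:

    #include <cstdio>
    #include <initializer_list>

    int main()
    {
        const int epochSize = 100, totalRecords = 1000;
        for (int epoch : {8, 9, 10, 11}) // 0-based epoch numbers, as in the log
            std::printf("starting epoch %d at record count %d, and file position %d\n",
                        epoch, epoch * epochSize, (epoch * epochSize) % totalRecords);
    }
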
+ Epoch[ 8 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.98662323; EvalErr[0]PerSample = 0.05000000; TotalTime = 0.0202s; SamplesPerSecond = 4952.7 +Finished Epoch[ 8 of 12]: [Training Set] TrainLossPerSample = 0.98662323; EvalErrPerSample = 0.049999997; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.021894 +Starting Epoch 9: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 8 at record count 800, and file position 800 already there from last epoch Starting minibatch loop. -randomordering: 16 retries for 100 elements (16.0%) to ensure window condition -randomordering: recached sequence for seed 8: 8, 5, ... - Epoch[ 9 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.75549896; EvalErr[0]PerSample = 0.35000000; TotalTime = 0.03488s; TotalTimePerSample = 0.34884ms; SamplesPerSecond = 2866 -Finished Epoch[ 9 of 12]: [Training Set] TrainLossPerSample = 1.7554989; EvalErrPerSample = 0.34999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.03521 -Starting Epoch 10: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 16 retries for 100 elements (16.0%) to ensure window condition +RandomOrdering: recached sequence for seed 8: 8, 5, ... + Epoch[ 9 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.72003899; EvalErr[0]PerSample = 0.01000000; TotalTime = 0.0202s; SamplesPerSecond = 4960.1 +Finished Epoch[ 9 of 12]: [Training Set] TrainLossPerSample = 0.72003895; EvalErrPerSample = 0.0099999998; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.021856 +Starting Epoch 10: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 9 at record count 900, and file position 900 already there from last epoch Starting minibatch loop. -randomordering: 16 retries for 100 elements (16.0%) to ensure window condition -randomordering: recached sequence for seed 9: 7, 10, ... - Epoch[10 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.64107086; EvalErr[0]PerSample = 0.39000000; TotalTime = 0.03478s; TotalTimePerSample = 0.34779ms; SamplesPerSecond = 2875 -Finished Epoch[10 of 12]: [Training Set] TrainLossPerSample = 1.6410708; EvalErrPerSample = 0.38999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.035064 -Starting Epoch 11: learning rate per sample = 0.005000 effective momentum = 0.700000 +RandomOrdering: 16 retries for 100 elements (16.0%) to ensure window condition +RandomOrdering: recached sequence for seed 9: 7, 10, ... + Epoch[10 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.60043072; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0222s; SamplesPerSecond = 4494.4 +Finished Epoch[10 of 12]: [Training Set] TrainLossPerSample = 0.60043073; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.023996 +Starting Epoch 11: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples starting epoch 10 at record count 1000, and file position 0 already there from last epoch Starting minibatch loop. -randomordering: 22 retries for 100 elements (22.0%) to ensure window condition -randomordering: recached sequence for seed 10: 13, 22, ... 
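
The new Starting Epoch lines add a "momentum as time constant" figure. It is consistent with treating the effective momentum m, applied once per 10-sample minibatch here (100 samples in 10 minibatches), as a first-order low-pass filter with time constant T = -mbSize / ln(m); note this formula is inferred from the logged numbers, not taken from the source:

    #include <cmath>
    #include <cstdio>

    // Time constant T (in samples) such that the effective per-minibatch
    // momentum equals exp(-mbSize / T). Inferred relationship, see above.
    double MomentumAsTimeConstant(double momentum, double mbSize)
    {
        return momentum > 0 ? -mbSize / std::log(momentum) : 0.0;
    }

    int main()
    {
        std::printf("%.1f samples\n", MomentumAsTimeConstant(0.0, 10)); // epochs 1-10: 0.0
        std::printf("%.1f samples\n", MomentumAsTimeConstant(0.7, 10)); // epochs 11-12: 28.0
    }
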
- Epoch[11 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.30029129; EvalErr[0]PerSample = 0.12000000; TotalTime = 0.03496s; TotalTimePerSample = 0.34960ms; SamplesPerSecond = 2860 -Finished Epoch[11 of 12]: [Training Set] TrainLossPerSample = 1.3002913; EvalErrPerSample = 0.12; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.035526 -Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 +RandomOrdering: 22 retries for 100 elements (22.0%) to ensure window condition +RandomOrdering: recached sequence for seed 10: 13, 22, ... + Epoch[11 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.42560429; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0216s; SamplesPerSecond = 4639.5 +Finished Epoch[11 of 12]: [Training Set] TrainLossPerSample = 0.42560428; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.023399 +Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples starting epoch 11 at record count 1100, and file position 100 already there from last epoch Starting minibatch loop. -randomordering: 21 retries for 100 elements (21.0%) to ensure window condition -randomordering: recached sequence for seed 11: 6, 31, ... - Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.01696381; EvalErr[0]PerSample = 0.05000000; TotalTime = 0.03480s; TotalTimePerSample = 0.34798ms; SamplesPerSecond = 2873 -Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 1.0169638; EvalErrPerSample = 0.049999997; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.035119 -CNTKCommandTrainEnd: Train +RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition +RandomOrdering: recached sequence for seed 11: 6, 31, ... + Epoch[12 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.33292500; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0197s; SamplesPerSecond = 5079.5 +Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.33292499; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.021406 +CNTKCommandTrainEnd: train + +Post-processing network... + +3 roots: + outputNodes.z = Plus + ce = CrossEntropyWithSoftmax + err = ErrorPrediction +FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation +FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation +FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation -Allocating matrices for forward propagation. +Validating for node outputNodes.z. 24 nodes to process in pass 1. - -Validating for node CE. 26 nodes to process in pass 1. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -897,15 +660,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE. 15 nodes to process in pass 2. +Validating for node outputNodes.z. 14 nodes to process in pass 2. -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -926,15 +687,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE, final verification. +Validating for node outputNodes.z, final verification. 
-Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -955,280 +714,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node CE. 26 nodes to process in pass 1. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for 
node CE. 14 nodes to process in pass 2. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node CE, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] 9 out of 24 nodes do not share the minibatch layout with the input data. - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -9 out of 24 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node ce. 26 nodes to process in pass 1. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1249,15 +745,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node ce. 14 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1278,15 +774,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node ce, final verification. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1307,19 +803,18 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node err. 26 nodes to process in pass 1. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1340,15 +835,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node err. 14 nodes to process in pass 2. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1369,15 +864,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node err, final verification. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1398,244 +893,268 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Post-processing network complete. evalNodeNames are not specified, using all the default evalnodes and training criterion nodes. + + +Allocating matrices for forward and/or backward propagation. starting epoch 0 at record count 0, and file position 0 already there from last epoch -randomordering: 11 retries for 100 elements (11.0%) to ensure window condition -randomordering: recached sequence for seed 0: 15, 33, ... 
-Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.87062637 Perplexity = 2.3884064 +RandomOrdering: 11 retries for 100 elements (11.0%) to ensure window condition +RandomOrdering: recached sequence for seed 0: 15, 33, ... +Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.29111847 Perplexity = 1.3379231 COMPLETED === Deleting last epoch data ==== Re-running from checkpoint ------------------------------------------------------------------- Build info: - Built time: Nov 23 2015 09:55:26 - Last modified date: Mon Nov 23 09:45:21 2015 - Built by alexeyk on alexey-rz - Build Path: C:\src\cntk\MachineLearning\CNTK\ - CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + Built time: Dec 18 2015 14:55:05 + Last modified date: Wed Dec 16 11:33:30 2015 + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + Build Branch: + Build SHA1: + Built by alexeyk on z840-01 + Build Path: C:\src\cntk\Source\CNTK\ ------------------------------------------------------------------- -running on alexey-rz at 2015/11/23 10:32:35 +running on z840-01 at 2015/12/18 15:06:14 command line: -C:\src\cntk\x64\Release\CNTK.exe configFile=C:\src\cntk\Tests\Image\QuickE2E\cntk.config RunDir=C:\src\cntk\Tests\Image\_run DataDir=C:\src\cntk\Tests\Image\Data ConfigDir=C:\src\cntk\Tests\Image\QuickE2E DeviceId=0 +C:\src\cntk\x64\Release\CNTK.exe configFile=QuickE2E\cntk.config ConfigDir=QuickE2E RunDir=_out DataDir=Data DeviceId=Auto stderr=_out\gpu.txt >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=$DeviceId$ -ndlMacros=$ConfigDir$/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=$RunDir$/models/cntk.dnn - deviceId=$DeviceId$ - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = $DeviceId$ +ndlMacros = "$ConfigDir$/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "$RunDir$/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=$RunDir$/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl +test = [ + action = "test" + modelPath = "$RunDir$/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + 
labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -RunDir=C:\src\cntk\Tests\Image\_run -DataDir=C:\src\cntk\Tests\Image\Data -ConfigDir=C:\src\cntk\Tests\Image\QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=_out\gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=0 -ndlMacros=C:\src\cntk\Tests\Image\QuickE2E/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = Auto +ndlMacros = "QuickE2E/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl +test = [ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -RunDir=C:\src\cntk\Tests\Image\_run -DataDir=C:\src\cntk\Tests\Image\Data -ConfigDir=C:\src\cntk\Tests\Image\QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=_out\gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -configparameters: cntk.config:command=Train:Test -configparameters: cntk.config:ConfigDir=C:\src\cntk\Tests\Image\QuickE2E -configparameters: cntk.config:DataDir=C:\src\cntk\Tests\Image\Data -configparameters: cntk.config:deviceId=0 -configparameters: cntk.config:ndlMacros=C:\src\cntk\Tests\Image\QuickE2E/Macros.ndl +configparameters: cntk.config:command=train:test +configparameters: cntk.config:ConfigDir=QuickE2E +configparameters: cntk.config:DataDir=Data +configparameters: cntk.config:deviceId=Auto +configparameters: cntk.config:ndlMacros=QuickE2E/Macros.ndl +configparameters: cntk.config:numCPUThreads=8 configparameters: cntk.config:parallelTrain=false configparameters: cntk.config:precision=float 
-configparameters: cntk.config:RunDir=C:\src\cntk\Tests\Image\_run -configparameters: cntk.config:Test=[ - action=test - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl +configparameters: cntk.config:RunDir=_out +configparameters: cntk.config:stderr=_out\gpu.txt +configparameters: cntk.config:test=[ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -configparameters: cntk.config:Train=[ - action=train - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +configparameters: cntk.config:train=[ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< -command: Train Test +command: train test precision = float -CNTKModelPath: C:\src\cntk\Tests\Image\_run/models/cntk.dnn -CNTKCommandTrainInfo: Train : 12 +Using 8 CPU threads +CNTKModelPath: _out/models/cntk.dnn +CNTKCommandTrainInfo: train : 12 CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 12 -CNTKCommandTrainBegin: Train +CNTKCommandTrainBegin: train +LockDevice: Locked GPU 0 to test availability. +LockDevice: Unlocked GPU 0 after testing. +LockDevice: Locked GPU 1 to test availability. +LockDevice: Unlocked GPU 1 after testing. +LockDevice: Locked GPU 2 to test availability. +LockDevice: Unlocked GPU 2 after testing. +LockDevice: Locked GPU 0 for exclusive use. NDLBuilder Using GPU 0 -reading uci file C:\src\cntk\Tests\Image\Data/Train.txt -Starting from checkpoint. Load Network From File C:\src\cntk\Tests\Image\_run/models/cntk.dnn.11. +Reading UCI file Data/Train.txt +Starting from checkpoint. Load Network From File _out/models/cntk.dnn.11. + +Post-processing network... + +3 roots: + ce = CrossEntropyWithSoftmax + outputNodes.z = Plus + err = ErrorPrediction +FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation +FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation +FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation -Allocating matrices for forward propagation. - - -Validating for node CE. 
26 nodes to process in pass 1. +Validating for node ce. 26 nodes to process in pass 1. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1656,15 +1175,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE. 15 nodes to process in pass 2. +Validating for node ce. 15 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1685,15 +1204,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE, final verification. +Validating for node ce, final verification. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1714,19 +1233,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Validating for node outputNodes.z. 24 nodes to process in pass 1. -Validating for node CE. 26 nodes to process in pass 1. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1747,15 +1264,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE. 14 nodes to process in pass 2. +Validating for node outputNodes.z. 13 nodes to process in pass 2. 
-Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1776,15 +1291,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE, final verification. +Validating for node outputNodes.z, final verification. -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1805,189 +1318,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] 9 out of 24 nodes do not share the minibatch layout with the input data. - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -9 out of 24 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node err. 26 nodes to process in pass 1. 
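The tensor dimensions repeated throughout these validation dumps are internally consistent, assuming 5x5 kernels (convW [16, 25] is 16 maps over 5*5 inputs; convW [32, 400] is 32 maps over 5*5*16 inputs) and 2x2 pooling with stride 2. The kernel and stride values are inferred from the shapes, not stated in the log. A compile-time check of the chain, as a sketch:

    // "valid" convolution: out = in - kernel + 1; 2x2 pooling halves each spatial dim
    static_assert(28 * 28 * 1  == 784,  "features: 28 x 28 x 1");
    static_assert(24 * 24 * 16 == 9216, "conv1: 28 - 5 + 1 = 24");
    static_assert(12 * 12 * 16 == 2304, "pool1: 24 / 2 = 12");
    static_assert( 8 *  8 * 32 == 2048, "conv2: 12 - 5 + 1 = 8");
    static_assert( 4 *  4 * 32 == 512,  "pool2: 8 / 2 = 4");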
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2008,15 +1349,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node err. 14 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2037,15 +1378,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node err, final verification. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2066,114 +1407,29 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. - - -Validating for node Err. 26 nodes to process in pass 1. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = 
Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err. 14 nodes to process in pass 2. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. +Post-processing network complete. SGD using GPU 0. -GetTrainCriterionNodes ... -GetEvalCriterionNodes ... + +Training criterion node(s): + ce = CrossEntropyWithSoftmax + +Evaluation criterion node(s): + err = ErrorPrediction -Allocating matrices for gradient computing +Allocating matrices for forward and/or backward propagation. No PreCompute nodes found, skipping PreCompute step Warning: checkpoint file is missing. learning parameters will be initialized from 0 Set Max Temp Mem Size For Convolution Nodes to 0 samples. -Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 +Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples starting at epoch 11 counting lines to determine record count 1000 records found @@ -2181,20 +1437,26 @@ starting epoch 11 at record count 1100, and file position 100 reading from record 0 to 100 to be positioned properly for epoch Starting minibatch loop. -randomordering: 21 retries for 100 elements (21.0%) to ensure window condition -randomordering: recached sequence for seed 11: 6, 31, ... 
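The per-sample figures in the "Starting Epoch 12" line above follow from the config: learningRatesPerMB = 0.05 over a minibatch of 10 gives 0.005 per sample, the schedule momentumPerMB = 0*10:0.7 switches from 0 to 0.7 after 10 epochs (hence 0.7 at epoch 12), and the reported "momentum as time constant" is consistent with -mbSize / ln(momentum). A small check under those assumptions:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const double lrPerMB = 0.05, momentum = 0.7;
        const int mbSize = 10;
        std::printf("lr per sample = %f\n", lrPerMB / mbSize);  // 0.005000
        std::printf("time constant = %.1f samples\n",
                    -mbSize / std::log(momentum));              // 28.0
    }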
- Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.03456436; EvalErr[0]PerSample = 0.02000000; TotalTime = 0.15455s; TotalTimePerSample = 1.54549ms; SamplesPerSecond = 647 -Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 1.0345644; EvalErrPerSample = 0.02; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.176761 -CNTKCommandTrainEnd: Train +RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition +RandomOrdering: recached sequence for seed 11: 6, 31, ... + Epoch[12 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.33976151; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.7157s; SamplesPerSecond = 139.7 +Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.3397615; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.736517 +CNTKCommandTrainEnd: train + +Post-processing network... + +3 roots: + outputNodes.z = Plus + ce = CrossEntropyWithSoftmax + err = ErrorPrediction +FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation +FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation +FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation -Allocating matrices for forward propagation. +Validating for node outputNodes.z. 24 nodes to process in pass 1. - -Validating for node CE. 26 nodes to process in pass 1. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2215,15 +1477,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE. 15 nodes to process in pass 2. +Validating for node outputNodes.z. 14 nodes to process in pass 2. 
-Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2244,15 +1504,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE, final verification. +Validating for node outputNodes.z, final verification. -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2273,280 +1531,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node CE. 26 nodes to process in pass 1. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node CE. 14 nodes to process in pass 2. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node CE, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] 9 out of 24 nodes do not share the minibatch layout with the input data. - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -9 out of 24 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node ce. 26 nodes to process in pass 1. 
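The "9 out of 24 nodes do not share the minibatch layout" tally above is exactly the set of LearnableParameter nodes, whose dimensions are fixed and carry no per-sample axis; the 26-node criterion graphs report 10 because the scalar ce/err node has no layout either. This reading is an inference from the dumps, sketched as a count:

    // Hypothetical tally of the layout-free nodes in the 24-node graph,
    // taken from the LearnableParameter lines in the dump above.
    const char* noLayout[] = {
        "outputNodes.W", "h1.W", "conv2_act.convW", "conv1_act.convW",
        "featScale", "conv1_act.convB", "conv2_act.convB", "h1.b", "outputNodes.b",
    };
    static_assert(sizeof(noLayout) / sizeof(noLayout[0]) == 9, "matches the log");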
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2567,15 +1562,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node ce. 14 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2596,15 +1591,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node ce, final verification. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2625,19 +1620,18 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node err. 26 nodes to process in pass 1. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2658,15 +1652,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node err. 14 nodes to process in pass 2. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2687,15 +1681,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node err, final verification. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2716,17 +1710,21 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Post-processing network complete. evalNodeNames are not specified, using all the default evalnodes and training criterion nodes. + + +Allocating matrices for forward and/or backward propagation. starting epoch 0 at record count 0, and file position 0 already there from last epoch -randomordering: 11 retries for 100 elements (11.0%) to ensure window condition -randomordering: recached sequence for seed 0: 15, 33, ... 
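In the "Final Results" lines that follow, Perplexity is exp of the per-sample cross entropy: exp(0.30440022) = 1.3558116 in the new baseline and exp(0.90504265) = 2.4720373 in the old one. A one-line verification of that relationship:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        // prints 1.355812 2.472037, matching the logged Perplexity values
        std::printf("%.6f %.6f\n", std::exp(0.30440022), std::exp(0.90504265));
    }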
-Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.90504265 Perplexity = 2.4720373 +RandomOrdering: 11 retries for 100 elements (11.0%) to ensure window condition +RandomOrdering: recached sequence for seed 0: 15, 33, ... +Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.30440022 Perplexity = 1.3558116 COMPLETED From ef80d86dedac4b21b58970bfe05c343f8398028f Mon Sep 17 00:00:00 2001 From: Alexey Kamenev Date: Fri, 18 Dec 2015 23:41:59 +0000 Subject: [PATCH 19/19] Updated Linux baselines. --- .../QuickE2E/baseline.linux.debug.gpu.txt | 2679 +++++------------ .../QuickE2E/baseline.linux.release.gpu.txt | 2566 +++++----------- 2 files changed, 1589 insertions(+), 3656 deletions(-) diff --git a/Tests/EndToEndTests/Image/QuickE2E/baseline.linux.debug.gpu.txt b/Tests/EndToEndTests/Image/QuickE2E/baseline.linux.debug.gpu.txt index ced418fd8..3c5b6f82d 100644 --- a/Tests/EndToEndTests/Image/QuickE2E/baseline.linux.debug.gpu.txt +++ b/Tests/EndToEndTests/Image/QuickE2E/baseline.linux.debug.gpu.txt @@ -1,244 +1,247 @@ -running on localhost at 2015/11/23 11:42:03 +------------------------------------------------------------------- +Build info: + + Built time: Dec 18 2015 23:32:02 + Last modified date: Fri Dec 18 23:24:08 2015 + Build type: release + Math lib: acml + CUDA_PATH: /usr/local/cuda-7.0 + CUB_PATH: /usr/local/cub-1.4.1 + Build Branch: master + Build SHA1: f675c24ad6e803523212d772c27ae2c2c98b6ce9 +------------------------------------------------------------------- +running on localhost at 2015/12/18 23:38:54 command line: -/home/alexey/Projects/cntk/bin/cntk configFile=./QuickE2E/cntk.config DataDir=./Data RunDir=. 
ConfigDir=./QuickE2E DeviceId=0 +../../../bin/cntk configFile=QuickE2E/cntk.config ConfigDir=QuickE2E RunDir=_out DataDir=Data DeviceId=Auto stderr=gpu.txt >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=$DeviceId$ -ndlMacros=$ConfigDir$/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=$RunDir$/models/cntk.dnn - deviceId=$DeviceId$ - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = $DeviceId$ +ndlMacros = "$ConfigDir$/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "$RunDir$/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=$RunDir$/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl +test = [ + action = "test" + modelPath = "$RunDir$/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -DataDir=./Data -RunDir=. 
-ConfigDir=./QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=0 -ndlMacros=./QuickE2E/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=./models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = Auto +ndlMacros = "QuickE2E/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=./models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl +test = [ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -DataDir=./Data -RunDir=. -ConfigDir=./QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -configparameters: cntk.config:command=Train:Test -configparameters: cntk.config:ConfigDir=./QuickE2E -configparameters: cntk.config:DataDir=./Data -configparameters: cntk.config:deviceId=0 -configparameters: cntk.config:ndlMacros=./QuickE2E/Macros.ndl +configparameters: cntk.config:command=train:test +configparameters: cntk.config:ConfigDir=QuickE2E +configparameters: cntk.config:DataDir=Data +configparameters: cntk.config:deviceId=Auto +configparameters: cntk.config:ndlMacros=QuickE2E/Macros.ndl +configparameters: cntk.config:numCPUThreads=8 configparameters: cntk.config:parallelTrain=false configparameters: cntk.config:precision=float -configparameters: cntk.config:RunDir=. 
-configparameters: cntk.config:Test=[
- action=test
- modelPath=./models/cntk.dnn
- NDLNetworkBuilder=[
- networkDescription=./QuickE2E/Convolution.ndl
+configparameters: cntk.config:RunDir=_out
+configparameters: cntk.config:stderr=gpu.txt
+configparameters: cntk.config:test=[
+ action = "test"
+ modelPath = "_out/models/cntk.dnn"
+ NDLNetworkBuilder = [
+ networkDescription = "QuickE2E/Convolution.ndl"
 ]
- reader=[
- readerType=UCIFastReader
- file=./Data/Test.txt
- features=[
- dim=784
- start=1
+ reader = [
+ readerType = "UCIFastReader"
+ file = "Data/Test.txt"
+ features = [
+ dim = 784
+ start = 1
 ]
- labels=[
- dim=1
- start=0
- labelDim=10
- labelMappingFile=./Data/labelsmap.txt
+ labels = [
+ dim = 1
+ start = 0
+ labelDim = 10
+ labelMappingFile = "Data/labelsmap.txt"
 ]
 ]
]
-configparameters: cntk.config:Train=[
- action=train
- modelPath=./models/cntk.dnn
- deviceId=0
- traceLevel=1
- NDLNetworkBuilder=[
- networkDescription=./QuickE2E/Convolution.ndl
- ]
- SGD=[
- epochSize=100
- minibatchSize=10
- learningRatesPerMB=0.05
- momentumPerMB=0*10:0.7
- maxEpochs=12
+configparameters: cntk.config:train=[
+ action = "train"
+ modelPath = "_out/models/cntk.dnn"
+ traceLevel = 1
+ NDLNetworkBuilder = [
+ networkDescription = "QuickE2E/Convolution.ndl"
 ]
- reader=[
- readerType=UCIFastReader
- file=./Data/Train.txt
- features=[
- dim=784
- start=1
+ SGD = [
+ epochSize = 100
+ minibatchSize = 10
+ learningRatesPerMB = 0.05
+ momentumPerMB = 0*10:0.7
+ maxEpochs = 12
+ ]
+ reader = [
+ readerType = "UCIFastReader"
+ file = "Data/Train.txt"
+ features = [
+ dim = 784
+ start = 1
 ]
- labels=[
- dim=1
- start=0
- labelDim=10
- labelMappingFile=./Data/labelsmap.txt
+ labels = [
+ dim = 1
+ start = 0
+ labelDim = 10
+ labelMappingFile = "Data/labelsmap.txt"
 ]
 ]
]
<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
-command: Train Test
+command: train test
precision = float
-CNTKModelPath: ./models/cntk.dnn
-CNTKCommandTrainInfo: Train : 12
+Using 8 CPU threads
+CNTKModelPath: _out/models/cntk.dnn
+CNTKCommandTrainInfo: train : 12
CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 12
-CNTKCommandTrainBegin: Train
+CNTKCommandTrainBegin: train
+LockDevice: Locked GPU 0 to test availability.
+LockDevice: Unlocked GPU 0 after testing.
+LockDevice: Locked GPU 1 to test availability.
+LockDevice: Unlocked GPU 1 after testing.
+LockDevice: Locked GPU 2 to test availability.
+LockDevice: Unlocked GPU 2 after testing.
+LockDevice: Locked GPU 3 to test availability.
+LockDevice: Unlocked GPU 3 after testing.
+LockDevice: Locked GPU 0 for exclusive use.
NDLBuilder Using GPU 0
-reading uci file ./Data/Train.txt
+Reading UCI file Data/Train.txt
+SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4
+
+Post-processing network...
+
+3 roots:
+ outputNodes.z = Plus
+ ce = CrossEntropyWithSoftmax
+ err = ErrorPrediction
+FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation
+FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation
+FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation
-Allocating matrices for forward propagation.
+Validating for node outputNodes.z. 24 nodes to process in pass 1.
-
-Printing Gradient Computation Node Order ...
-
-CE[0, 0] = CrossEntropyWithSoftmax(labels[10, 1], OutputNodes.z[0, 0])
-OutputNodes.z[0, 0] = Plus(OutputNodes.t[0, 0], OutputNodes.b[10, 1])
-OutputNodes.b[10, 1] = LearnableParameter
-OutputNodes.t[0, 0] = Times(OutputNodes.W[10, 128], h1.y[0, 0])
-h1.y[0, 0] = Sigmoid(h1.z[0, 0])
-h1.z[0, 0] = Plus(h1.t[0, 0], h1.b[128, 1])
-h1.b[128, 1] = LearnableParameter
-h1.t[0, 0] = Times(h1.W[128, 512], pool2[0, 0])
-pool2[0, 0] = AveragePooling(conv2_act.act[0, 0])
-conv2_act.act[0, 0] = RectifiedLinear(conv2_act.convPlusB[0, 0])
-conv2_act.convPlusB[0, 0] = Plus(conv2_act.conv[0, 0], conv2_act.convB[32, 1])
-conv2_act.convB[32, 1] = LearnableParameter
-conv2_act.conv[0, 0] = Convolution(conv2_act.convW[32, 400], pool1[0, 0])
-pool1[0, 0] = MaxPooling(conv1_act.act[0, 0])
-conv1_act.act[0, 0] = RectifiedLinear(conv1_act.convPlusB[0, 0])
-conv1_act.convPlusB[0, 0] = Plus(conv1_act.conv[0, 0], conv1_act.convB[16, 1])
-conv1_act.convB[16, 1] = LearnableParameter
-conv1_act.conv[0, 0] = Convolution(conv1_act.convW[16, 25], featScaled[0, 0])
-featScaled[0, 0] = Scale(featScale[1, 1], features[784, 1])
-features[784, 1] = InputValue
-featScale[1, 1] = LearnableParameter
-conv1_act.convW[16, 25] = LearnableParameter
-conv2_act.convW[32, 400] = LearnableParameter
-h1.W[128, 512] = LearnableParameter
-OutputNodes.W[10, 128] = LearnableParameter
-labels[10, 1] = InputValue
-
-Validating for node CE. 26 nodes to process in pass 1.
-
-Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
Validating --> h1.W = LearnableParameter -> [128, 512]
Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -247,27 +250,25 @@ Validating --> features = InputValue -> [784, MBSize 1]
Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating for node CE. 15 nodes to process in pass 2.
+Validating for node outputNodes.z. 14 nodes to process in pass 2.
-Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
Validating --> h1.W = LearnableParameter -> [128, 512]
Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -276,27 +277,25 @@ Validating --> features = InputValue -> [784, MBSize 1]
Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating for node CE, final verification.
+Validating for node outputNodes.z, final verification.
-Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
Validating --> h1.W = LearnableParameter -> [128, 512]
Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -305,292 +304,29 @@ Validating --> features = InputValue -> [784, MBSize 1]
Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
-
-10 out of 26 nodes do not share the minibatch layout with the input data.
-
-
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]

9 out of 24 nodes do not share the minibatch layout with the input data.

-Validating for node Err. 26 nodes to process in pass 1.
+Validating for node ce. 26 nodes to process in pass 1.
Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
Validating --> h1.W = LearnableParameter -> [128, 512]
Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -599,27 +335,27 @@ Validating --> features = InputValue -> [784, MBSize 1]
Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1]
-Validating for node Err. 14 nodes to process in pass 2.
+Validating for node ce. 14 nodes to process in pass 2.
Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
Validating --> h1.W = LearnableParameter -> [128, 512]
Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -628,27 +364,27 @@ Validating --> features = InputValue -> [784, MBSize 1]
Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1]
-Validating for node Err, final verification.
+Validating for node ce, final verification.
Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
Validating --> h1.W = LearnableParameter -> [128, 512]
Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -657,31 +393,30 @@ Validating --> features = InputValue -> [784, MBSize 1]
Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1]

10 out of 26 nodes do not share the minibatch layout with the input data.

-
-Validating for node Err. 26 nodes to process in pass 1.
+Validating for node err. 26 nodes to process in pass 1.
Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
Validating --> h1.W = LearnableParameter -> [128, 512]
Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -690,27 +425,27 @@ Validating --> features = InputValue -> [784, MBSize 1]
Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1]
-Validating for node Err. 14 nodes to process in pass 2.
+Validating for node err. 14 nodes to process in pass 2.
Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
Validating --> h1.W = LearnableParameter -> [128, 512]
Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -719,27 +454,27 @@ Validating --> features = InputValue -> [784, MBSize 1]
Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1]
-Validating for node Err, final verification.
+Validating for node err, final verification.
Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
Validating --> h1.W = LearnableParameter -> [128, 512]
Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -748,35 +483,40 @@ Validating --> features = InputValue -> [784, MBSize 1]
Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. -SetUniformRandomValue (GPU): creating curand object with seed 1 +Post-processing network complete. + SGD using GPU 0. -GetTrainCriterionNodes ... -GetEvalCriterionNodes ... + +Training criterion node(s): + ce = CrossEntropyWithSoftmax + +Evaluation criterion node(s): + err = ErrorPrediction -Allocating matrices for gradient computing +Allocating matrices for forward and/or backward propagation. No PreCompute nodes found, skipping PreCompute step Set Max Temp Mem Size For Convolution Nodes to 0 samples. -Starting Epoch 1: learning rate per sample = 0.005000 effective momentum = 0.000000 +Starting Epoch 1: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting at epoch 0 counting lines to determine record count 1000 records found @@ -784,148 +524,126 @@ starting epoch 0 at record count 0, and file position 0 already there from last epoch Starting minibatch loop. -randomordering: 21 retries for 100 elements (21.0%) to ensure window condition -randomordering: recached sequence for seed 0: 38, 46, ... - Epoch[ 1 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.41911163; EvalErr[0]PerSample = 0.92000000; TotalTime = 0.53526s; TotalTimePerSample = 5.35259ms; SamplesPerSecond = 186 -Finished Epoch[ 1 of 12]: [Training Set] TrainLossPerSample = 2.4191115; EvalErrPerSample = 0.91999996; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.558449 -Starting Epoch 2: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition +RandomOrdering: recached sequence for seed 0: 38, 46, ... + Epoch[ 1 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.39150986; EvalErr[0]PerSample = 0.94000000; TotalTime = 0.1702s; SamplesPerSecond = 587.5 +Finished Epoch[ 1 of 12]: [Training Set] TrainLossPerSample = 2.3915098; EvalErrPerSample = 0.94; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.191305 +Starting Epoch 2: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 1 at record count 100, and file position 100 already there from last epoch Starting minibatch loop. -randomordering: 21 retries for 100 elements (21.0%) to ensure window condition -randomordering: recached sequence for seed 1: 38, 46, ... - Epoch[ 2 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.38765198; EvalErr[0]PerSample = 0.89000000; TotalTime = 0.06055s; TotalTimePerSample = 0.60545ms; SamplesPerSecond = 1651 -Finished Epoch[ 2 of 12]: [Training Set] TrainLossPerSample = 2.3876519; EvalErrPerSample = 0.88999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.060761 -Starting Epoch 3: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition +RandomOrdering: recached sequence for seed 1: 38, 46, ... 
+ Epoch[ 2 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.29544342; EvalErr[0]PerSample = 0.87000000; TotalTime = 0.0532s; SamplesPerSecond = 1878.2 +Finished Epoch[ 2 of 12]: [Training Set] TrainLossPerSample = 2.2954433; EvalErrPerSample = 0.87; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.053534 +Starting Epoch 3: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 2 at record count 200, and file position 200 already there from last epoch Starting minibatch loop. -randomordering: 30 retries for 100 elements (30.0%) to ensure window condition -randomordering: recached sequence for seed 2: 34, 6, ... - Epoch[ 3 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.30177277; EvalErr[0]PerSample = 0.85000000; TotalTime = 0.06050s; TotalTimePerSample = 0.60495ms; SamplesPerSecond = 1653 -Finished Epoch[ 3 of 12]: [Training Set] TrainLossPerSample = 2.3017728; EvalErrPerSample = 0.84999996; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.060688 -Starting Epoch 4: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 30 retries for 100 elements (30.0%) to ensure window condition +RandomOrdering: recached sequence for seed 2: 34, 6, ... + Epoch[ 3 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.11703644; EvalErr[0]PerSample = 0.69000000; TotalTime = 0.0535s; SamplesPerSecond = 1870.5 +Finished Epoch[ 3 of 12]: [Training Set] TrainLossPerSample = 2.1170363; EvalErrPerSample = 0.69; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.053674 +Starting Epoch 4: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 3 at record count 300, and file position 300 already there from last epoch Starting minibatch loop. -randomordering: 14 retries for 100 elements (14.0%) to ensure window condition -randomordering: recached sequence for seed 3: 35, 34, ... - Epoch[ 4 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.33002518; EvalErr[0]PerSample = 0.89000000; TotalTime = 0.05966s; TotalTimePerSample = 0.59664ms; SamplesPerSecond = 1676 -Finished Epoch[ 4 of 12]: [Training Set] TrainLossPerSample = 2.3300252; EvalErrPerSample = 0.88999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.059867 -Starting Epoch 5: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 14 retries for 100 elements (14.0%) to ensure window condition +RandomOrdering: recached sequence for seed 3: 35, 34, ... + Epoch[ 4 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.99407211; EvalErr[0]PerSample = 0.65000000; TotalTime = 0.0541s; SamplesPerSecond = 1847.6 +Finished Epoch[ 4 of 12]: [Training Set] TrainLossPerSample = 1.9940721; EvalErrPerSample = 0.64999998; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.05433 +Starting Epoch 5: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 4 at record count 400, and file position 400 already there from last epoch Starting minibatch loop. -randomordering: 13 retries for 100 elements (13.0%) to ensure window condition -randomordering: recached sequence for seed 4: 30, 23, ... 
- Epoch[ 5 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.23725708; EvalErr[0]PerSample = 0.88000000; TotalTime = 0.05858s; TotalTimePerSample = 0.58577ms; SamplesPerSecond = 1707 -Finished Epoch[ 5 of 12]: [Training Set] TrainLossPerSample = 2.237257; EvalErrPerSample = 0.88; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.058768 -Starting Epoch 6: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 13 retries for 100 elements (13.0%) to ensure window condition +RandomOrdering: recached sequence for seed 4: 30, 23, ... + Epoch[ 5 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.72756714; EvalErr[0]PerSample = 0.45000000; TotalTime = 0.0555s; SamplesPerSecond = 1801.4 +Finished Epoch[ 5 of 12]: [Training Set] TrainLossPerSample = 1.7275671; EvalErrPerSample = 0.44999999; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.055725 +Starting Epoch 6: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 5 at record count 500, and file position 500 already there from last epoch Starting minibatch loop. -randomordering: 25 retries for 100 elements (25.0%) to ensure window condition -randomordering: recached sequence for seed 5: 33, 43, ... - Epoch[ 6 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.24089386; EvalErr[0]PerSample = 0.90000000; TotalTime = 0.05882s; TotalTimePerSample = 0.58824ms; SamplesPerSecond = 1699 -Finished Epoch[ 6 of 12]: [Training Set] TrainLossPerSample = 2.2408938; EvalErrPerSample = 0.89999998; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.059015 -Starting Epoch 7: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 25 retries for 100 elements (25.0%) to ensure window condition +RandomOrdering: recached sequence for seed 5: 33, 43, ... + Epoch[ 6 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.51963303; EvalErr[0]PerSample = 0.21000000; TotalTime = 0.0539s; SamplesPerSecond = 1854.3 +Finished Epoch[ 6 of 12]: [Training Set] TrainLossPerSample = 1.5196329; EvalErrPerSample = 0.20999999; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.054135 +Starting Epoch 7: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 6 at record count 600, and file position 600 already there from last epoch Starting minibatch loop. -randomordering: 14 retries for 100 elements (14.0%) to ensure window condition -randomordering: recached sequence for seed 6: 12, 17, ... - Epoch[ 7 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.15189026; EvalErr[0]PerSample = 0.80000000; TotalTime = 0.05783s; TotalTimePerSample = 0.57827ms; SamplesPerSecond = 1729 -Finished Epoch[ 7 of 12]: [Training Set] TrainLossPerSample = 2.1518903; EvalErrPerSample = 0.79999995; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.058039 -Starting Epoch 8: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 14 retries for 100 elements (14.0%) to ensure window condition +RandomOrdering: recached sequence for seed 6: 12, 17, ... 
+ Epoch[ 7 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.29057953; EvalErr[0]PerSample = 0.20000000; TotalTime = 0.0548s; SamplesPerSecond = 1823.9 +Finished Epoch[ 7 of 12]: [Training Set] TrainLossPerSample = 1.2905796; EvalErrPerSample = 0.19999999; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.055041 +Starting Epoch 8: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 7 at record count 700, and file position 700 already there from last epoch Starting minibatch loop. -randomordering: 14 retries for 100 elements (14.0%) to ensure window condition -randomordering: recached sequence for seed 7: 40, 7, ... - Epoch[ 8 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.02036377; EvalErr[0]PerSample = 0.68000000; TotalTime = 0.05703s; TotalTimePerSample = 0.57030ms; SamplesPerSecond = 1753 -Finished Epoch[ 8 of 12]: [Training Set] TrainLossPerSample = 2.0203638; EvalErrPerSample = 0.68000001; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.057228 -Starting Epoch 9: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 14 retries for 100 elements (14.0%) to ensure window condition +RandomOrdering: recached sequence for seed 7: 40, 7, ... + Epoch[ 8 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.97354042; EvalErr[0]PerSample = 0.05000000; TotalTime = 0.0543s; SamplesPerSecond = 1841.6 +Finished Epoch[ 8 of 12]: [Training Set] TrainLossPerSample = 0.97354043; EvalErrPerSample = 0.049999997; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.054515 +Starting Epoch 9: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 8 at record count 800, and file position 800 already there from last epoch Starting minibatch loop. -randomordering: 17 retries for 100 elements (17.0%) to ensure window condition -randomordering: recached sequence for seed 8: 8, 48, ... - Epoch[ 9 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.74879242; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.05806s; TotalTimePerSample = 0.58065ms; SamplesPerSecond = 1722 -Finished Epoch[ 9 of 12]: [Training Set] TrainLossPerSample = 1.7487924; EvalErrPerSample = 0.44; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.058275 -Starting Epoch 10: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 17 retries for 100 elements (17.0%) to ensure window condition +RandomOrdering: recached sequence for seed 8: 8, 48, ... + Epoch[ 9 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.73900383; EvalErr[0]PerSample = 0.03000000; TotalTime = 0.0544s; SamplesPerSecond = 1837.7 +Finished Epoch[ 9 of 12]: [Training Set] TrainLossPerSample = 0.73900384; EvalErrPerSample = 0.029999999; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.054655 +Starting Epoch 10: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 9 at record count 900, and file position 900 already there from last epoch Starting minibatch loop. -randomordering: 21 retries for 100 elements (21.0%) to ensure window condition -randomordering: recached sequence for seed 9: 14, 26, ... 
- Epoch[10 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.56006454; EvalErr[0]PerSample = 0.18000000; TotalTime = 0.05721s; TotalTimePerSample = 0.57207ms; SamplesPerSecond = 1748 -Finished Epoch[10 of 12]: [Training Set] TrainLossPerSample = 1.5600646; EvalErrPerSample = 0.17999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.057409 -Starting Epoch 11: learning rate per sample = 0.005000 effective momentum = 0.700000 +RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition +RandomOrdering: recached sequence for seed 9: 14, 26, ... + Epoch[10 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.57405857; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0542s; SamplesPerSecond = 1846.4 +Finished Epoch[10 of 12]: [Training Set] TrainLossPerSample = 0.57405853; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.054379 +Starting Epoch 11: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples starting epoch 10 at record count 1000, and file position 0 already there from last epoch Starting minibatch loop. -randomordering: 31 retries for 100 elements (31.0%) to ensure window condition -randomordering: recached sequence for seed 10: 22, 4, ... - Epoch[11 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.32553162; EvalErr[0]PerSample = 0.14000000; TotalTime = 0.05778s; TotalTimePerSample = 0.57785ms; SamplesPerSecond = 1730 -Finished Epoch[11 of 12]: [Training Set] TrainLossPerSample = 1.3255316; EvalErrPerSample = 0.14; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.058009 -Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 +RandomOrdering: 31 retries for 100 elements (31.0%) to ensure window condition +RandomOrdering: recached sequence for seed 10: 22, 4, ... + Epoch[11 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.45112953; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0591s; SamplesPerSecond = 1690.7 +Finished Epoch[11 of 12]: [Training Set] TrainLossPerSample = 0.45112953; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.05945 +Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples starting epoch 11 at record count 1100, and file position 100 already there from last epoch Starting minibatch loop. -randomordering: 17 retries for 100 elements (17.0%) to ensure window condition -randomordering: recached sequence for seed 11: 2, 40, ... - Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.01003433; EvalErr[0]PerSample = 0.03000000; TotalTime = 0.05721s; TotalTimePerSample = 0.57209ms; SamplesPerSecond = 1747 -Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 1.0100343; EvalErrPerSample = 0.029999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.057407 -CNTKCommandTrainEnd: Train +RandomOrdering: 17 retries for 100 elements (17.0%) to ensure window condition +RandomOrdering: recached sequence for seed 11: 2, 40, ... 
+ Epoch[12 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.34545708; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0540s; SamplesPerSecond = 1851.0 +Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.34545708; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.054291 +CNTKCommandTrainEnd: train + +Post-processing network... + +3 roots: + ce = CrossEntropyWithSoftmax + err = ErrorPrediction + outputNodes.z = Plus +FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation +FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation +FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation -Allocating matrices for forward propagation. - - -Printing Gradient Computation Node Order ... - -CE[0, 0] = CrossEntropyWithSoftmax(labels[10, 0], OutputNodes.z[0, 0]) -OutputNodes.z[0, 0] = Plus(OutputNodes.t[0, 0], OutputNodes.b[10, 1]) -OutputNodes.b[10, 1] = LearnableParameter -OutputNodes.t[0, 0] = Times(OutputNodes.W[10, 128], h1.y[0, 0]) -h1.y[0, 0] = Sigmoid(h1.z[0, 0]) -h1.z[0, 0] = Plus(h1.t[0, 0], h1.b[128, 1]) -h1.b[128, 1] = LearnableParameter -h1.t[0, 0] = Times(h1.W[128, 512], pool2[0, 0]) -pool2[0, 0] = AveragePooling(conv2_act.act[0, 0]) -conv2_act.act[0, 0] = RectifiedLinear(conv2_act.convPlusB[0, 0]) -conv2_act.convPlusB[0, 0] = Plus(conv2_act.conv[0, 0], conv2_act.convB[32, 1]) -conv2_act.convB[32, 1] = LearnableParameter -conv2_act.conv[0, 0] = Convolution(conv2_act.convW[32, 400], pool1[0, 0]) -pool1[0, 0] = MaxPooling(conv1_act.act[0, 0]) -conv1_act.act[0, 0] = RectifiedLinear(conv1_act.convPlusB[0, 0]) -conv1_act.convPlusB[0, 0] = Plus(conv1_act.conv[0, 0], conv1_act.convB[16, 1]) -conv1_act.convB[16, 1] = LearnableParameter -conv1_act.conv[0, 0] = Convolution(conv1_act.convW[16, 25], featScaled[0, 0]) -featScaled[0, 0] = Scale(featScale[1, 1], features[784, 0]) -features[784, 0] = InputValue -featScale[1, 1] = LearnableParameter -conv1_act.convW[16, 25] = LearnableParameter -conv2_act.convW[32, 400] = LearnableParameter -h1.W[128, 512] = LearnableParameter -OutputNodes.W[10, 128] = LearnableParameter -labels[10, 0] = InputValue - -Validating for node CE. 26 nodes to process in pass 1. +Validating for node ce. 26 nodes to process in pass 1. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -946,15 +664,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE. 15 nodes to process in pass 2. +Validating for node ce. 15 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -975,15 +693,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE, final verification. +Validating for node ce, final verification. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1004,19 +722,18 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. - -Validating for node CE. 26 nodes to process in pass 1. +Validating for node err. 26 nodes to process in pass 1. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1037,15 +754,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE. 14 nodes to process in pass 2. +Validating for node err. 14 nodes to process in pass 2. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1066,15 +783,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE, final verification. +Validating for node err, final verification. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1095,18 +812,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Validating for node outputNodes.z. 24 nodes to process in pass 1. -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1127,13 +843,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node OutputNodes.z. 13 nodes to process in pass 2. +Validating for node outputNodes.z. 13 nodes to process in pass 2. -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1154,13 +870,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node OutputNodes.z, final verification. +Validating for node outputNodes.z, final verification. 
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1181,530 +897,271 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] 9 out of 24 nodes do not share the minibatch layout with the input data. - - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. - -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 
13 nodes to process in pass 2. - -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -9 out of 24 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err. 14 nodes to process in pass 2. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err. 14 nodes to process in pass 2. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - +Post-processing network complete. evalNodeNames are not specified, using all the default evalnodes and training criterion nodes. + + +Allocating matrices for forward and/or backward propagation. starting epoch 0 at record count 0, and file position 0 already there from last epoch -randomordering: 21 retries for 100 elements (21.0%) to ensure window condition -randomordering: recached sequence for seed 0: 38, 46, ... -Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.84035759 Perplexity = 2.3171954 +RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition +RandomOrdering: recached sequence for seed 0: 38, 46, ... 
+Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.30271576 Perplexity = 1.3535297 COMPLETED === Deleting last epoch data ==== Re-running from checkpoint -running on localhost at 2015/11/23 11:43:57 + +------------------------------------------------------------------- +Build info: + + Built time: Dec 18 2015 23:32:02 + Last modified date: Fri Dec 18 23:24:08 2015 + Build type: release + Math lib: acml + CUDA_PATH: /usr/local/cuda-7.0 + CUB_PATH: /usr/local/cub-1.4.1 + Build Branch: master + Build SHA1: f675c24ad6e803523212d772c27ae2c2c98b6ce9 +------------------------------------------------------------------- +running on localhost at 2015/12/18 23:41:15 command line: -/home/alexey/Projects/cntk/bin/cntk configFile=./QuickE2E/cntk.config DataDir=./Data RunDir=. ConfigDir=./QuickE2E DeviceId=0 +../../../bin/cntk configFile=QuickE2E/cntk.config ConfigDir=QuickE2E RunDir=_out DataDir=Data DeviceId=Auto stderr=gpu.txt >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=$DeviceId$ -ndlMacros=$ConfigDir$/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=$RunDir$/models/cntk.dnn - deviceId=$DeviceId$ - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = $DeviceId$ +ndlMacros = "$ConfigDir$/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "$RunDir$/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=$RunDir$/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl +test = [ + action = "test" + modelPath = "$RunDir$/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -DataDir=./Data -RunDir=. 
-ConfigDir=./QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=0 -ndlMacros=./QuickE2E/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=./models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = Auto +ndlMacros = "QuickE2E/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=./models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl +test = [ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -DataDir=./Data -RunDir=. -ConfigDir=./QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -configparameters: cntk.config:command=Train:Test -configparameters: cntk.config:ConfigDir=./QuickE2E -configparameters: cntk.config:DataDir=./Data -configparameters: cntk.config:deviceId=0 -configparameters: cntk.config:ndlMacros=./QuickE2E/Macros.ndl +configparameters: cntk.config:command=train:test +configparameters: cntk.config:ConfigDir=QuickE2E +configparameters: cntk.config:DataDir=Data +configparameters: cntk.config:deviceId=Auto +configparameters: cntk.config:ndlMacros=QuickE2E/Macros.ndl +configparameters: cntk.config:numCPUThreads=8 configparameters: cntk.config:parallelTrain=false configparameters: cntk.config:precision=float -configparameters: cntk.config:RunDir=. 
-configparameters: cntk.config:Test=[ - action=test - modelPath=./models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl +configparameters: cntk.config:RunDir=_out +configparameters: cntk.config:stderr=gpu.txt +configparameters: cntk.config:test=[ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -configparameters: cntk.config:Train=[ - action=train - modelPath=./models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +configparameters: cntk.config:train=[ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< -command: Train Test +command: train test precision = float -CNTKModelPath: ./models/cntk.dnn -CNTKCommandTrainInfo: Train : 12 +Using 8 CPU threads +CNTKModelPath: _out/models/cntk.dnn +CNTKCommandTrainInfo: train : 12 CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 12 -CNTKCommandTrainBegin: Train +CNTKCommandTrainBegin: train +LockDevice: Locked GPU 0 to test availability. +LockDevice: Unlocked GPU 0 after testing. +LockDevice: Locked GPU 1 to test availability. +LockDevice: Unlocked GPU 1 after testing. +LockDevice: Locked GPU 2 to test availability. +LockDevice: Unlocked GPU 2 after testing. +LockDevice: Locked GPU 3 to test availability. +LockDevice: Unlocked GPU 3 after testing. +LockDevice: Locked GPU 0 for exclusive use. NDLBuilder Using GPU 0 -reading uci file ./Data/Train.txt -Starting from checkpoint. Load Network From File ./models/cntk.dnn.11. +Reading UCI file Data/Train.txt +Starting from checkpoint. Load Network From File _out/models/cntk.dnn.11. + +Post-processing network... + +3 roots: + err = ErrorPrediction + outputNodes.z = Plus + ce = CrossEntropyWithSoftmax +FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation +FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation +FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation -Allocating matrices for forward propagation. - - -Printing Gradient Computation Node Order ... 
- -CE[0, 0] = CrossEntropyWithSoftmax(labels[10, 0], OutputNodes.z[0, 0]) -OutputNodes.z[0, 0] = Plus(OutputNodes.t[0, 0], OutputNodes.b[10, 1]) -OutputNodes.b[10, 1] = LearnableParameter -OutputNodes.t[0, 0] = Times(OutputNodes.W[10, 128], h1.y[0, 0]) -h1.y[0, 0] = Sigmoid(h1.z[0, 0]) -h1.z[0, 0] = Plus(h1.t[0, 0], h1.b[128, 1]) -h1.b[128, 1] = LearnableParameter -h1.t[0, 0] = Times(h1.W[128, 512], pool2[0, 0]) -pool2[0, 0] = AveragePooling(conv2_act.act[0, 0]) -conv2_act.act[0, 0] = RectifiedLinear(conv2_act.convPlusB[0, 0]) -conv2_act.convPlusB[0, 0] = Plus(conv2_act.conv[0, 0], conv2_act.convB[32, 1]) -conv2_act.convB[32, 1] = LearnableParameter -conv2_act.conv[0, 0] = Convolution(conv2_act.convW[32, 400], pool1[0, 0]) -pool1[0, 0] = MaxPooling(conv1_act.act[0, 0]) -conv1_act.act[0, 0] = RectifiedLinear(conv1_act.convPlusB[0, 0]) -conv1_act.convPlusB[0, 0] = Plus(conv1_act.conv[0, 0], conv1_act.convB[16, 1]) -conv1_act.convB[16, 1] = LearnableParameter -conv1_act.conv[0, 0] = Convolution(conv1_act.convW[16, 25], featScaled[0, 0]) -featScaled[0, 0] = Scale(featScale[1, 1], features[784, 0]) -features[784, 0] = InputValue -featScale[1, 1] = LearnableParameter -conv1_act.convW[16, 25] = LearnableParameter -conv2_act.convW[32, 400] = LearnableParameter -h1.W[128, 512] = LearnableParameter -OutputNodes.W[10, 128] = LearnableParameter -labels[10, 0] = InputValue - -Validating for node CE. 26 nodes to process in pass 1. +Validating for node err. 26 nodes to process in pass 1. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1725,15 +1182,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE. 15 nodes to process in pass 2. +Validating for node err. 15 nodes to process in pass 2. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1754,15 +1211,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE, final verification. +Validating for node err, final verification. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1783,19 +1240,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Validating for node outputNodes.z. 24 nodes to process in pass 1. -Validating for node CE. 26 nodes to process in pass 1. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1816,15 +1271,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE. 14 nodes to process in pass 2. +Validating for node outputNodes.z. 13 nodes to process in pass 2. -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1845,15 +1298,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE, final verification. +Validating for node outputNodes.z, final verification. 
-Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1874,189 +1325,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. - -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] 9 out of 24 nodes do not share the minibatch layout with the input data. - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -9 out of 24 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node ce. 26 nodes to process in pass 1. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2077,15 +1356,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node ce. 14 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2106,15 +1385,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node ce, final verification. 
Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
Validating --> h1.W = LearnableParameter -> [128, 512]
Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2135,114 +1414,29 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
Validating --> h1.b = LearnableParameter -> [128, 1]
Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

10 out of 26 nodes do not share the minibatch layout with the input data.

-
-
-Validating for node Err. 26 nodes to process in pass 1.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-Validating for node Err. 14 nodes to process in pass 2.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-Validating for node Err, final verification.
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. +Post-processing network complete. SGD using GPU 0. -GetTrainCriterionNodes ... -GetEvalCriterionNodes ... + +Training criterion node(s): + ce = CrossEntropyWithSoftmax + +Evaluation criterion node(s): + err = ErrorPrediction -Allocating matrices for gradient computing +Allocating matrices for forward and/or backward propagation. No PreCompute nodes found, skipping PreCompute step Warning: checkpoint file is missing. learning parameters will be initialized from 0 Set Max Temp Mem Size For Convolution Nodes to 0 samples. -Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 +Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples starting at epoch 11 counting lines to determine record count 1000 records found @@ -2250,49 +1444,26 @@ starting epoch 11 at record count 1100, and file position 100 reading from record 0 to 100 to be positioned properly for epoch Starting minibatch loop. -randomordering: 17 retries for 100 elements (17.0%) to ensure window condition -randomordering: recached sequence for seed 11: 2, 40, ... 
- Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.01463303; EvalErr[0]PerSample = 0.02000000; TotalTime = 0.12786s; TotalTimePerSample = 1.27864ms; SamplesPerSecond = 782 -Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 1.0146331; EvalErrPerSample = 0.02; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.152772 -CNTKCommandTrainEnd: Train +RandomOrdering: 17 retries for 100 elements (17.0%) to ensure window condition +RandomOrdering: recached sequence for seed 11: 2, 40, ... + Epoch[12 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.34671265; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.1653s; SamplesPerSecond = 604.8 +Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.34671265; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.188096 +CNTKCommandTrainEnd: train + +Post-processing network... + +3 roots: + outputNodes.z = Plus + err = ErrorPrediction + ce = CrossEntropyWithSoftmax +FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation +FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation +FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation -Allocating matrices for forward propagation. +Validating for node outputNodes.z. 24 nodes to process in pass 1. - -Printing Gradient Computation Node Order ... - -CE[0, 0] = CrossEntropyWithSoftmax(labels[10, 0], OutputNodes.z[0, 0]) -OutputNodes.z[0, 0] = Plus(OutputNodes.t[0, 0], OutputNodes.b[10, 1]) -OutputNodes.b[10, 1] = LearnableParameter -OutputNodes.t[0, 0] = Times(OutputNodes.W[10, 128], h1.y[0, 0]) -h1.y[0, 0] = Sigmoid(h1.z[0, 0]) -h1.z[0, 0] = Plus(h1.t[0, 0], h1.b[128, 1]) -h1.b[128, 1] = LearnableParameter -h1.t[0, 0] = Times(h1.W[128, 512], pool2[0, 0]) -pool2[0, 0] = AveragePooling(conv2_act.act[0, 0]) -conv2_act.act[0, 0] = RectifiedLinear(conv2_act.convPlusB[0, 0]) -conv2_act.convPlusB[0, 0] = Plus(conv2_act.conv[0, 0], conv2_act.convB[32, 1]) -conv2_act.convB[32, 1] = LearnableParameter -conv2_act.conv[0, 0] = Convolution(conv2_act.convW[32, 400], pool1[0, 0]) -pool1[0, 0] = MaxPooling(conv1_act.act[0, 0]) -conv1_act.act[0, 0] = RectifiedLinear(conv1_act.convPlusB[0, 0]) -conv1_act.convPlusB[0, 0] = Plus(conv1_act.conv[0, 0], conv1_act.convB[16, 1]) -conv1_act.convB[16, 1] = LearnableParameter -conv1_act.conv[0, 0] = Convolution(conv1_act.convW[16, 25], featScaled[0, 0]) -featScaled[0, 0] = Scale(featScale[1, 1], features[784, 0]) -features[784, 0] = InputValue -featScale[1, 1] = LearnableParameter -conv1_act.convW[16, 25] = LearnableParameter -conv2_act.convW[32, 400] = LearnableParameter -h1.W[128, 512] = LearnableParameter -OutputNodes.W[10, 128] = LearnableParameter -labels[10, 0] = InputValue - -Validating for node CE. 26 nodes to process in pass 1. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2313,15 +1484,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE. 15 nodes to process in pass 2. +Validating for node outputNodes.z. 14 nodes to process in pass 2. -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2342,15 +1511,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE, final verification. +Validating for node outputNodes.z, final verification. 
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
Validating --> h1.W = LearnableParameter -> [128, 512]
Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2371,280 +1538,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
Validating --> h1.b = LearnableParameter -> [128, 1]
Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-10 out of 26 nodes do not share the minibatch layout with the input data.
-
-
-
-Validating for node CE. 26 nodes to process in pass 1.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-Validating for node CE. 14 nodes to process in pass 2.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-Validating for node CE, final verification.
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] 9 out of 24 nodes do not share the minibatch layout with the input data. - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -9 out of 24 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node err. 26 nodes to process in pass 1. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2665,15 +1569,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node err. 14 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2694,15 +1598,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node err, final verification. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2723,19 +1627,18 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node ce. 26 nodes to process in pass 1. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2756,15 +1659,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node ce. 14 nodes to process in pass 2. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2785,15 +1688,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node ce, final verification. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2814,17 +1717,21 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Post-processing network complete. evalNodeNames are not specified, using all the default evalnodes and training criterion nodes. + + +Allocating matrices for forward and/or backward propagation. starting epoch 0 at record count 0, and file position 0 already there from last epoch -randomordering: 21 retries for 100 elements (21.0%) to ensure window condition -randomordering: recached sequence for seed 0: 38, 46, ... 
-Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.88667282 Perplexity = 2.427041 +RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition +RandomOrdering: recached sequence for seed 0: 38, 46, ... +Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.31798401 Perplexity = 1.3743543 COMPLETED diff --git a/Tests/EndToEndTests/Image/QuickE2E/baseline.linux.release.gpu.txt b/Tests/EndToEndTests/Image/QuickE2E/baseline.linux.release.gpu.txt index bbba1a1b9..2dcc47bf6 100644 --- a/Tests/EndToEndTests/Image/QuickE2E/baseline.linux.release.gpu.txt +++ b/Tests/EndToEndTests/Image/QuickE2E/baseline.linux.release.gpu.txt @@ -1,215 +1,247 @@ -running on localhost at 2015/11/23 11:51:07 +------------------------------------------------------------------- +Build info: + + Built time: Dec 18 2015 23:17:10 + Last modified date: Fri Dec 18 23:16:43 2015 + Build type: release + Math lib: acml + CUDA_PATH: /usr/local/cuda-7.0 + CUB_PATH: /usr/local/cub-1.4.1 + Build Branch: master + Build SHA1: f675c24ad6e803523212d772c27ae2c2c98b6ce9 +------------------------------------------------------------------- +running on localhost at 2015/12/18 23:25:20 command line: -/home/alexey/Projects/cntk/bin/cntk configFile=./QuickE2E/cntk.config DataDir=./Data RunDir=. ConfigDir=./QuickE2E DeviceId=0 +../../../bin/cntk configFile=QuickE2E/cntk.config ConfigDir=QuickE2E RunDir=_out DataDir=Data DeviceId=Auto stderr=_outgpu.txt >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=$DeviceId$ -ndlMacros=$ConfigDir$/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=$RunDir$/models/cntk.dnn - deviceId=$DeviceId$ - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = $DeviceId$ +ndlMacros = "$ConfigDir$/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "$RunDir$/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=$RunDir$/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl +test = [ + action = "test" + modelPath = "$RunDir$/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = 
"$DataDir$/labelsmap.txt" ] ] ] -DataDir=./Data -RunDir=. -ConfigDir=./QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=_outgpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=0 -ndlMacros=./QuickE2E/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=./models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = Auto +ndlMacros = "QuickE2E/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=./models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl +test = [ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -DataDir=./Data -RunDir=. -ConfigDir=./QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=_outgpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -configparameters: cntk.config:command=Train:Test -configparameters: cntk.config:ConfigDir=./QuickE2E -configparameters: cntk.config:DataDir=./Data -configparameters: cntk.config:deviceId=0 -configparameters: cntk.config:ndlMacros=./QuickE2E/Macros.ndl +configparameters: cntk.config:command=train:test +configparameters: cntk.config:ConfigDir=QuickE2E +configparameters: cntk.config:DataDir=Data +configparameters: cntk.config:deviceId=Auto +configparameters: cntk.config:ndlMacros=QuickE2E/Macros.ndl +configparameters: cntk.config:numCPUThreads=8 configparameters: cntk.config:parallelTrain=false configparameters: cntk.config:precision=float -configparameters: cntk.config:RunDir=. 
-configparameters: cntk.config:Test=[ - action=test - modelPath=./models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl +configparameters: cntk.config:RunDir=_out +configparameters: cntk.config:stderr=_outgpu.txt +configparameters: cntk.config:test=[ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -configparameters: cntk.config:Train=[ - action=train - modelPath=./models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +configparameters: cntk.config:train=[ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< -command: Train Test +command: train test precision = float -CNTKModelPath: ./models/cntk.dnn -CNTKCommandTrainInfo: Train : 12 +Using 8 CPU threads +CNTKModelPath: _out/models/cntk.dnn +CNTKCommandTrainInfo: train : 12 CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 12 -CNTKCommandTrainBegin: Train +CNTKCommandTrainBegin: train +LockDevice: Locked GPU 0 to test availability. +LockDevice: Unlocked GPU 0 after testing. +LockDevice: Locked GPU 1 to test availability. +LockDevice: Unlocked GPU 1 after testing. +LockDevice: Locked GPU 2 to test availability. +LockDevice: Unlocked GPU 2 after testing. +LockDevice: Locked GPU 3 to test availability. +LockDevice: Unlocked GPU 3 after testing. +LockDevice: Locked GPU 0 for exclusive use. NDLBuilder Using GPU 0 -reading uci file ./Data/Train.txt +Reading UCI file Data/Train.txt +SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4 + +Post-processing network... + +3 roots: + outputNodes.z = Plus + ce = CrossEntropyWithSoftmax + err = ErrorPrediction +FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation +FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation +FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation -Allocating matrices for forward propagation. +Validating for node outputNodes.z. 24 nodes to process in pass 1. - -Validating for node CE. 26 nodes to process in pass 1. 
- -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -218,27 +250,25 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], 
h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE. 15 nodes to process in pass 2. +Validating for node outputNodes.z. 14 nodes to process in pass 2. -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -247,27 +277,25 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 
1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE, final verification. +Validating for node outputNodes.z, final verification. -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -276,292 +304,29 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = 
Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node CE. 26 nodes to process in pass 1. - -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -Validating for node CE. 14 nodes to process in pass 2. 
- -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -Validating for node CE, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] - -Validating for node OutputNodes.z, final verification. 
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 1]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]

 9 out of 24 nodes do not share the minibatch layout with the input data.
-
-Validating for node OutputNodes.z. 24 nodes to process in pass 1.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 1]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-
-Validating for node OutputNodes.z. 13 nodes to process in pass 2.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 1]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-
-Validating for node OutputNodes.z, final verification.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 1]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-
-9 out of 24 nodes do not share the minibatch layout with the input data.
-
-
-
-Validating for node Err. 26 nodes to process in pass 1.
+Validating for node ce. 26 nodes to process in pass 1.
 Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -570,27 +335,27 @@ Validating --> features = InputValue -> [784, MBSize 1]
 Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
 Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
 Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
 Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
 Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node Err. 14 nodes to process in pass 2.
+Validating for node ce. 14 nodes to process in pass 2.

 Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -599,27 +364,27 @@ Validating --> features = InputValue -> [784, MBSize 1]
 Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
 Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
 Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
 Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
 Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node Err, final verification.
+Validating for node ce, final verification.

 Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -628,31 +393,30 @@ Validating --> features = InputValue -> [784, MBSize 1]
 Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
 Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
 Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
 Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
 Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1]

 10 out of 26 nodes do not share the minibatch layout with the input data.
-
-Validating for node Err. 26 nodes to process in pass 1.
+Validating for node err. 26 nodes to process in pass 1.

 Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -661,27 +425,27 @@ Validating --> features = InputValue -> [784, MBSize 1]
 Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
 Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
 Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
 Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
 Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node Err. 14 nodes to process in pass 2.
+Validating for node err. 14 nodes to process in pass 2.

 Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -690,27 +454,27 @@ Validating --> features = InputValue -> [784, MBSize 1]
 Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
 Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
 Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
 Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
 Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node Err, final verification.
+Validating for node err, final verification.
 Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -719,35 +483,40 @@ Validating --> features = InputValue -> [784, MBSize 1]
 Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
 Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
 Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
 Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
 Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1]

 10 out of 26 nodes do not share the minibatch layout with the input data.

-SetUniformRandomValue (GPU): creating curand object with seed 1
+Post-processing network complete.
+
 SGD using GPU 0.

-GetTrainCriterionNodes ...
-GetEvalCriterionNodes ...
+
+Training criterion node(s):
+	ce = CrossEntropyWithSoftmax
+
+Evaluation criterion node(s):
+	err = ErrorPrediction

-Allocating matrices for gradient computing
+Allocating matrices for forward and/or backward propagation.
 No PreCompute nodes found, skipping PreCompute step
 Set Max Temp Mem Size For Convolution Nodes to 0 samples.

-Starting Epoch 1: learning rate per sample = 0.005000 effective momentum = 0.000000
+Starting Epoch 1: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples
 starting at epoch 0 counting lines to determine record count
 1000 records found
 starting epoch 0 at record count 0, and file position 0
 already there from last epoch

 Starting minibatch loop.
-randomordering: 21 retries for 100 elements (21.0%) to ensure window condition
-randomordering: recached sequence for seed 0: 38, 46, ...
- Epoch[ 1 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.41911163; EvalErr[0]PerSample = 0.92000000; TotalTime = 0.10084s; TotalTimePerSample = 1.00839ms; SamplesPerSecond = 991
-Finished Epoch[ 1 of 12]: [Training Set] TrainLossPerSample = 2.4191115; EvalErrPerSample = 0.91999996; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.114029
-Starting Epoch 2: learning rate per sample = 0.005000 effective momentum = 0.000000
+RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 0: 38, 46, ...
+MBLayout::Init: Resizing m_distanceToStart from 1 x 0 to 10 x 1
+ Epoch[ 1 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.39150986; EvalErr[0]PerSample = 0.94000000; TotalTime = 0.5221s; SamplesPerSecond = 191.5
+Finished Epoch[ 1 of 12]: [Training Set] TrainLossPerSample = 2.3915098; EvalErrPerSample = 0.94; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.542948
+Starting Epoch 2: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples
 starting epoch 1 at record count 100, and file position 100
 already there from last epoch

 Starting minibatch loop.
-randomordering: 21 retries for 100 elements (21.0%) to ensure window condition
-randomordering: recached sequence for seed 1: 38, 46, ...
- Epoch[ 2 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.38765198; EvalErr[0]PerSample = 0.89000000; TotalTime = 0.03237s; TotalTimePerSample = 0.32373ms; SamplesPerSecond = 3088
-Finished Epoch[ 2 of 12]: [Training Set] TrainLossPerSample = 2.3876519; EvalErrPerSample = 0.88999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.032542
-Starting Epoch 3: learning rate per sample = 0.005000 effective momentum = 0.000000
+RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 1: 38, 46, ...
+ Epoch[ 2 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.29544357; EvalErr[0]PerSample = 0.87000000; TotalTime = 0.0198s; SamplesPerSecond = 5052.8
+Finished Epoch[ 2 of 12]: [Training Set] TrainLossPerSample = 2.2954435; EvalErrPerSample = 0.87; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.020015
+Starting Epoch 3: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples
 starting epoch 2 at record count 200, and file position 200
 already there from last epoch

 Starting minibatch loop.
-randomordering: 30 retries for 100 elements (30.0%) to ensure window condition
-randomordering: recached sequence for seed 2: 34, 6, ...
- Epoch[ 3 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.30177277; EvalErr[0]PerSample = 0.85000000; TotalTime = 0.03249s; TotalTimePerSample = 0.32492ms; SamplesPerSecond = 3077
-Finished Epoch[ 3 of 12]: [Training Set] TrainLossPerSample = 2.3017728; EvalErrPerSample = 0.84999996; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.032668
-Starting Epoch 4: learning rate per sample = 0.005000 effective momentum = 0.000000
+RandomOrdering: 30 retries for 100 elements (30.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 2: 34, 6, ...
+ Epoch[ 3 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.11703644; EvalErr[0]PerSample = 0.69000000; TotalTime = 0.0193s; SamplesPerSecond = 5173.0
+Finished Epoch[ 3 of 12]: [Training Set] TrainLossPerSample = 2.1170363; EvalErrPerSample = 0.69; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.019492
+Starting Epoch 4: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples
 starting epoch 3 at record count 300, and file position 300
 already there from last epoch

 Starting minibatch loop.
-randomordering: 14 retries for 100 elements (14.0%) to ensure window condition
-randomordering: recached sequence for seed 3: 35, 34, ...
- Epoch[ 4 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.33002518; EvalErr[0]PerSample = 0.89000000; TotalTime = 0.03224s; TotalTimePerSample = 0.32243ms; SamplesPerSecond = 3101
-Finished Epoch[ 4 of 12]: [Training Set] TrainLossPerSample = 2.3300252; EvalErrPerSample = 0.88999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.032407
-Starting Epoch 5: learning rate per sample = 0.005000 effective momentum = 0.000000
+RandomOrdering: 14 retries for 100 elements (14.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 3: 35, 34, ...
+ Epoch[ 4 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.99407211; EvalErr[0]PerSample = 0.65000000; TotalTime = 0.0197s; SamplesPerSecond = 5072.5
+Finished Epoch[ 4 of 12]: [Training Set] TrainLossPerSample = 1.9940721; EvalErrPerSample = 0.64999998; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.019875
+Starting Epoch 5: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples
 starting epoch 4 at record count 400, and file position 400
 already there from last epoch

 Starting minibatch loop.
-randomordering: 13 retries for 100 elements (13.0%) to ensure window condition
-randomordering: recached sequence for seed 4: 30, 23, ...
- Epoch[ 5 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.23725708; EvalErr[0]PerSample = 0.88000000; TotalTime = 0.03227s; TotalTimePerSample = 0.32265ms; SamplesPerSecond = 3099
-Finished Epoch[ 5 of 12]: [Training Set] TrainLossPerSample = 2.237257; EvalErrPerSample = 0.88; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.03243
-Starting Epoch 6: learning rate per sample = 0.005000 effective momentum = 0.000000
+RandomOrdering: 13 retries for 100 elements (13.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 4: 30, 23, ...
+ Epoch[ 5 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.72756805; EvalErr[0]PerSample = 0.45000000; TotalTime = 0.0228s; SamplesPerSecond = 4381.2
+Finished Epoch[ 5 of 12]: [Training Set] TrainLossPerSample = 1.727568; EvalErrPerSample = 0.44999999; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.023131
+Starting Epoch 6: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples
 starting epoch 5 at record count 500, and file position 500
 already there from last epoch

 Starting minibatch loop.
-randomordering: 25 retries for 100 elements (25.0%) to ensure window condition
-randomordering: recached sequence for seed 5: 33, 43, ...
- Epoch[ 6 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.24089386; EvalErr[0]PerSample = 0.90000000; TotalTime = 0.03225s; TotalTimePerSample = 0.32247ms; SamplesPerSecond = 3101
-Finished Epoch[ 6 of 12]: [Training Set] TrainLossPerSample = 2.2408938; EvalErrPerSample = 0.89999998; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.032414
-Starting Epoch 7: learning rate per sample = 0.005000 effective momentum = 0.000000
+RandomOrdering: 25 retries for 100 elements (25.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 5: 33, 43, ...
+ Epoch[ 6 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.51963470; EvalErr[0]PerSample = 0.21000000; TotalTime = 0.0198s; SamplesPerSecond = 5047.2
+Finished Epoch[ 6 of 12]: [Training Set] TrainLossPerSample = 1.5196347; EvalErrPerSample = 0.20999999; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.020056
+Starting Epoch 7: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples
 starting epoch 6 at record count 600, and file position 600
 already there from last epoch

 Starting minibatch loop.
-randomordering: 14 retries for 100 elements (14.0%) to ensure window condition
-randomordering: recached sequence for seed 6: 12, 17, ...
- Epoch[ 7 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.15189026; EvalErr[0]PerSample = 0.80000000; TotalTime = 0.03228s; TotalTimePerSample = 0.32278ms; SamplesPerSecond = 3098
-Finished Epoch[ 7 of 12]: [Training Set] TrainLossPerSample = 2.1518903; EvalErrPerSample = 0.79999995; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.032436
-Starting Epoch 8: learning rate per sample = 0.005000 effective momentum = 0.000000
+RandomOrdering: 14 retries for 100 elements (14.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 6: 12, 17, ...
+ Epoch[ 7 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.29057785; EvalErr[0]PerSample = 0.20000000; TotalTime = 0.0200s; SamplesPerSecond = 4995.5
+Finished Epoch[ 7 of 12]: [Training Set] TrainLossPerSample = 1.2905778; EvalErrPerSample = 0.19999999; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.020179
+Starting Epoch 8: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples
 starting epoch 7 at record count 700, and file position 700
 already there from last epoch

 Starting minibatch loop.
-randomordering: 14 retries for 100 elements (14.0%) to ensure window condition
-randomordering: recached sequence for seed 7: 40, 7, ...
- Epoch[ 8 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.02036377; EvalErr[0]PerSample = 0.68000000; TotalTime = 0.03236s; TotalTimePerSample = 0.32362ms; SamplesPerSecond = 3090
-Finished Epoch[ 8 of 12]: [Training Set] TrainLossPerSample = 2.0203638; EvalErrPerSample = 0.68000001; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.032545
-Starting Epoch 9: learning rate per sample = 0.005000 effective momentum = 0.000000
+RandomOrdering: 14 retries for 100 elements (14.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 7: 40, 7, ...
+ Epoch[ 8 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.97353966; EvalErr[0]PerSample = 0.05000000; TotalTime = 0.0192s; SamplesPerSecond = 5198.3
+Finished Epoch[ 8 of 12]: [Training Set] TrainLossPerSample = 0.97353965; EvalErrPerSample = 0.049999997; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.019408
+Starting Epoch 9: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples
 starting epoch 8 at record count 800, and file position 800
 already there from last epoch

 Starting minibatch loop.
-randomordering: 17 retries for 100 elements (17.0%) to ensure window condition
-randomordering: recached sequence for seed 8: 8, 48, ...
- Epoch[ 9 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.74879242; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.03041s; TotalTimePerSample = 0.30406ms; SamplesPerSecond = 3288
-Finished Epoch[ 9 of 12]: [Training Set] TrainLossPerSample = 1.7487924; EvalErrPerSample = 0.44; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.030574
-Starting Epoch 10: learning rate per sample = 0.005000 effective momentum = 0.000000
+RandomOrdering: 17 retries for 100 elements (17.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 8: 8, 48, ...
+ Epoch[ 9 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.73900352; EvalErr[0]PerSample = 0.03000000; TotalTime = 0.0192s; SamplesPerSecond = 5201.0
+Finished Epoch[ 9 of 12]: [Training Set] TrainLossPerSample = 0.73900348; EvalErrPerSample = 0.029999999; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.019389
+Starting Epoch 10: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples
 starting epoch 9 at record count 900, and file position 900
 already there from last epoch

 Starting minibatch loop.
-randomordering: 21 retries for 100 elements (21.0%) to ensure window condition
-randomordering: recached sequence for seed 9: 14, 26, ...
- Epoch[10 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.56006454; EvalErr[0]PerSample = 0.18000000; TotalTime = 0.03032s; TotalTimePerSample = 0.30320ms; SamplesPerSecond = 3298
-Finished Epoch[10 of 12]: [Training Set] TrainLossPerSample = 1.5600646; EvalErrPerSample = 0.17999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.030483
-Starting Epoch 11: learning rate per sample = 0.005000 effective momentum = 0.700000
+RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 9: 14, 26, ...
+ Epoch[10 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.57409992; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0193s; SamplesPerSecond = 5188.9
+Finished Epoch[10 of 12]: [Training Set] TrainLossPerSample = 0.5740999; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.019445
+Starting Epoch 11: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples
 starting epoch 10 at record count 1000, and file position 0
 already there from last epoch

 Starting minibatch loop.
-randomordering: 31 retries for 100 elements (31.0%) to ensure window condition
-randomordering: recached sequence for seed 10: 22, 4, ...
- Epoch[11 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.32553162; EvalErr[0]PerSample = 0.14000000; TotalTime = 0.03050s; TotalTimePerSample = 0.30496ms; SamplesPerSecond = 3279
-Finished Epoch[11 of 12]: [Training Set] TrainLossPerSample = 1.3255316; EvalErrPerSample = 0.14; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.030661
-Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000
+RandomOrdering: 31 retries for 100 elements (31.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 10: 22, 4, ...
+ Epoch[11 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.45136490; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0196s; SamplesPerSecond = 5107.5
+Finished Epoch[11 of 12]: [Training Set] TrainLossPerSample = 0.45136487; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.019784
+Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples
 starting epoch 11 at record count 1100, and file position 100
 already there from last epoch

 Starting minibatch loop.
-randomordering: 17 retries for 100 elements (17.0%) to ensure window condition
-randomordering: recached sequence for seed 11: 2, 40, ...
- Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.01003433; EvalErr[0]PerSample = 0.03000000; TotalTime = 0.03054s; TotalTimePerSample = 0.30545ms; SamplesPerSecond = 3273
-Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 1.0100343; EvalErrPerSample = 0.029999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.030702
-CNTKCommandTrainEnd: Train
+RandomOrdering: 17 retries for 100 elements (17.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 11: 2, 40, ...
+ Epoch[12 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.34551861; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0194s; SamplesPerSecond = 5165.0
+Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.34551859; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.019532
+CNTKCommandTrainEnd: train
+
+Post-processing network...
+
+3 roots:
+	ce = CrossEntropyWithSoftmax
+	outputNodes.z = Plus
+	err = ErrorPrediction
 FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation
 FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation
 FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation
-Allocating matrices for forward propagation.
-
-
-Validating for node CE. 26 nodes to process in pass 1.
+Validating for node ce. 26 nodes to process in pass 1.

 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -888,15 +665,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node CE. 15 nodes to process in pass 2.
+Validating for node ce. 15 nodes to process in pass 2.
 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -917,15 +694,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node CE, final verification.
+Validating for node ce, final verification.

 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -946,19 +723,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

 10 out of 26 nodes do not share the minibatch layout with the input data.

+Validating for node outputNodes.z. 24 nodes to process in pass 1.

-Validating for node CE. 26 nodes to process in pass 1.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -979,15 +754,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]

-Validating for node CE. 14 nodes to process in pass 2.
+Validating for node outputNodes.z. 13 nodes to process in pass 2.

-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -1008,15 +781,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]

-Validating for node CE, final verification.
+Validating for node outputNodes.z, final verification.
-Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1037,189 +808,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. - -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] 9 out of 24 nodes do not share the minibatch layout with the input data. - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -9 out of 24 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node err. 26 nodes to process in pass 1. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1240,15 +839,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node err. 14 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1269,15 +868,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node err, final verification. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1298,326 +897,272 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err. 
14 nodes to process in pass 2. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Post-processing network complete. evalNodeNames are not specified, using all the default evalnodes and training criterion nodes. + + +Allocating matrices for forward and/or backward propagation. starting epoch 0 at record count 0, and file position 0 already there from last epoch -randomordering: 21 retries for 100 elements (21.0%) to ensure window condition -randomordering: recached sequence for seed 0: 38, 46, ... -Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.84035759 Perplexity = 2.3171954 +RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition +RandomOrdering: recached sequence for seed 0: 38, 46, ... 
+MBLayout::Init: Resizing m_distanceToStart from 1 x 0 to 100 x 1 +Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.30270519 Perplexity = 1.3535154 COMPLETED === Deleting last epoch data ==== Re-running from checkpoint -running on localhost at 2015/11/23 11:52:51 +------------------------------------------------------------------- +Build info: + + Built time: Dec 18 2015 23:17:10 + Last modified date: Fri Dec 18 23:16:43 2015 + Build type: release + Math lib: acml + CUDA_PATH: /usr/local/cuda-7.0 + CUB_PATH: /usr/local/cub-1.4.1 + Build Branch: master + Build SHA1: f675c24ad6e803523212d772c27ae2c2c98b6ce9 +------------------------------------------------------------------- +running on localhost at 2015/12/18 23:27:59 command line: -/home/alexey/Projects/cntk/bin/cntk configFile=./QuickE2E/cntk.config DataDir=./Data RunDir=. ConfigDir=./QuickE2E DeviceId=0 +../../../bin/cntk configFile=QuickE2E/cntk.config ConfigDir=QuickE2E RunDir=_out DataDir=Data DeviceId=Auto stderr=gpu.txt >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=$DeviceId$ -ndlMacros=$ConfigDir$/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=$RunDir$/models/cntk.dnn - deviceId=$DeviceId$ - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = $DeviceId$ +ndlMacros = "$ConfigDir$/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "$RunDir$/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=$RunDir$/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl +test = [ + action = "test" + modelPath = "$RunDir$/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -DataDir=./Data -RunDir=. 
-ConfigDir=./QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=0 -ndlMacros=./QuickE2E/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=./models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = Auto +ndlMacros = "QuickE2E/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=./models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl +test = [ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -DataDir=./Data -RunDir=. -ConfigDir=./QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -configparameters: cntk.config:command=Train:Test -configparameters: cntk.config:ConfigDir=./QuickE2E -configparameters: cntk.config:DataDir=./Data -configparameters: cntk.config:deviceId=0 -configparameters: cntk.config:ndlMacros=./QuickE2E/Macros.ndl +configparameters: cntk.config:command=train:test +configparameters: cntk.config:ConfigDir=QuickE2E +configparameters: cntk.config:DataDir=Data +configparameters: cntk.config:deviceId=Auto +configparameters: cntk.config:ndlMacros=QuickE2E/Macros.ndl +configparameters: cntk.config:numCPUThreads=8 configparameters: cntk.config:parallelTrain=false configparameters: cntk.config:precision=float -configparameters: cntk.config:RunDir=. 
-configparameters: cntk.config:Test=[ - action=test - modelPath=./models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl +configparameters: cntk.config:RunDir=_out +configparameters: cntk.config:stderr=gpu.txt +configparameters: cntk.config:test=[ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -configparameters: cntk.config:Train=[ - action=train - modelPath=./models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +configparameters: cntk.config:train=[ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< -command: Train Test +command: train test precision = float -CNTKModelPath: ./models/cntk.dnn -CNTKCommandTrainInfo: Train : 12 +Using 8 CPU threads +CNTKModelPath: _out/models/cntk.dnn +CNTKCommandTrainInfo: train : 12 CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 12 -CNTKCommandTrainBegin: Train +CNTKCommandTrainBegin: train +LockDevice: Locked GPU 0 to test availability. +LockDevice: Unlocked GPU 0 after testing. +LockDevice: Locked GPU 1 to test availability. +LockDevice: Unlocked GPU 1 after testing. +LockDevice: Locked GPU 2 to test availability. +LockDevice: Unlocked GPU 2 after testing. +LockDevice: Locked GPU 3 to test availability. +LockDevice: Unlocked GPU 3 after testing. +LockDevice: Locked GPU 0 for exclusive use. NDLBuilder Using GPU 0 -reading uci file ./Data/Train.txt -Starting from checkpoint. Load Network From File ./models/cntk.dnn.11. +Reading UCI file Data/Train.txt +Starting from checkpoint. Load Network From File _out/models/cntk.dnn.11. + +Post-processing network... + +3 roots: + err = ErrorPrediction + outputNodes.z = Plus + ce = CrossEntropyWithSoftmax +FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation +FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation +FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation -Allocating matrices for forward propagation. - - -Validating for node CE. 26 nodes to process in pass 1. +Validating for node err. 26 nodes to process in pass 1. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1638,15 +1183,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE. 15 nodes to process in pass 2. +Validating for node err. 15 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1667,15 +1212,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE, final verification. +Validating for node err, final verification. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1696,19 +1241,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Validating for node outputNodes.z. 24 nodes to process in pass 1. -Validating for node CE. 26 nodes to process in pass 1. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1729,15 +1272,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE. 14 nodes to process in pass 2. +Validating for node outputNodes.z. 13 nodes to process in pass 2. 
-Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1758,15 +1299,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE, final verification. +Validating for node outputNodes.z, final verification. -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1787,189 +1326,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] 9 out of 24 nodes do not share the minibatch layout with the input data. - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-Validating for node OutputNodes.z. 13 nodes to process in pass 2.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-Validating for node OutputNodes.z, final verification.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-9 out of 24 nodes do not share the minibatch layout with the input data.
-
-
-
-Validating for node Err. 26 nodes to process in pass 1.
+Validating for node ce. 26 nodes to process in pass 1.
 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -1990,15 +1357,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]
 
-Validating for node Err. 14 nodes to process in pass 2.
+Validating for node ce. 14 nodes to process in pass 2.
 
 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2019,15 +1386,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]
 
-Validating for node Err, final verification.
+Validating for node ce, final verification.
 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2048,114 +1415,29 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]
 
 10 out of 26 nodes do not share the minibatch layout with the input data.
 
-
-
-Validating for node Err. 26 nodes to process in pass 1.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-Validating for node Err. 14 nodes to process in pass 2.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-Validating for node Err, final verification.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-10 out of 26 nodes do not share the minibatch layout with the input data.
+Post-processing network complete.
 
 SGD using GPU 0.
 
-GetTrainCriterionNodes ...
-GetEvalCriterionNodes ...
+
+Training criterion node(s):
+
+	ce = CrossEntropyWithSoftmax
+
+Evaluation criterion node(s):
+
+	err = ErrorPrediction
 
-Allocating matrices for gradient computing
+Allocating matrices for forward and/or backward propagation.
 No PreCompute nodes found, skipping PreCompute step
 Warning: checkpoint file is missing. learning parameters will be initialized from 0
 Set Max Temp Mem Size For Convolution Nodes to 0 samples.
-Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000
+Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples
 starting at epoch 11 counting lines to determine record count
 1000 records found
@@ -2163,20 +1445,28 @@ starting epoch 11 at record count 1100, and file position 100
 reading from record 0 to 100 to be positioned properly for epoch
 Starting minibatch loop.
-randomordering: 17 retries for 100 elements (17.0%) to ensure window condition
-randomordering: recached sequence for seed 11: 2, 40, ...
- Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.01463303; EvalErr[0]PerSample = 0.02000000; TotalTime = 0.10305s; TotalTimePerSample = 1.03054ms; SamplesPerSecond = 970
-Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 1.0146331; EvalErrPerSample = 0.02; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.117787
-CNTKCommandTrainEnd: Train
+RandomOrdering: 17 retries for 100 elements (17.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 11: 2, 40, ...
+MBLayout::Init: Resizing m_distanceToStart from 1 x 0 to 10 x 1
+ Epoch[12 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.34676910; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.1298s; SamplesPerSecond = 770.6
+Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.34676909; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.143116
+CNTKCommandTrainEnd: train
+
+Post-processing network...
+
+3 roots:
+	err = ErrorPrediction
+	outputNodes.z = Plus
+	ce = CrossEntropyWithSoftmax
+FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation
+FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation
+FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation
 
-Allocating matrices for forward propagation.
-
-
-Validating for node CE. 26 nodes to process in pass 1.
+Validating for node err. 26 nodes to process in pass 1.
 
 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2197,15 +1487,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]
 
-Validating for node CE. 15 nodes to process in pass 2.
+Validating for node err. 15 nodes to process in pass 2.
 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2226,15 +1516,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]
 
-Validating for node CE, final verification.
+Validating for node err, final verification.
 
 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2255,19 +1545,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]
 
 10 out of 26 nodes do not share the minibatch layout with the input data.
 
+Validating for node outputNodes.z. 24 nodes to process in pass 1.
 
-Validating for node CE. 26 nodes to process in pass 1.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2288,15 +1576,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
 
-Validating for node CE. 14 nodes to process in pass 2.
+Validating for node outputNodes.z. 13 nodes to process in pass 2.
 
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2317,15 +1603,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
 
-Validating for node CE, final verification.
+Validating for node outputNodes.z, final verification.
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2346,189 +1630,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-10 out of 26 nodes do not share the minibatch layout with the input data.
-
-
-
-Validating for node OutputNodes.z. 24 nodes to process in pass 1.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-Validating for node OutputNodes.z. 13 nodes to process in pass 2.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-Validating for node OutputNodes.z, final verification.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
 
 9 out of 24 nodes do not share the minibatch layout with the input data.
 
-
-Validating for node OutputNodes.z. 24 nodes to process in pass 1.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-Validating for node OutputNodes.z. 13 nodes to process in pass 2.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-Validating for node OutputNodes.z, final verification.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-9 out of 24 nodes do not share the minibatch layout with the input data.
-
-
-
-Validating for node Err. 26 nodes to process in pass 1.
+Validating for node ce. 26 nodes to process in pass 1.
 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2549,15 +1661,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]
 
-Validating for node Err. 14 nodes to process in pass 2.
+Validating for node ce. 14 nodes to process in pass 2.
 
 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2578,15 +1690,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]
 
-Validating for node Err, final verification.
+Validating for node ce, final verification.
 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2607,108 +1719,22 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]
 
 10 out of 26 nodes do not share the minibatch layout with the input data.
 
-
-
-Validating for node Err. 26 nodes to process in pass 1.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-Validating for node Err. 14 nodes to process in pass 2.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-Validating for node Err, final verification.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
 
 10 out of 26 nodes do not share the minibatch layout with the input data.
 
+Post-processing network complete.
+
 evalNodeNames are not specified, using all the default evalnodes and training criterion nodes.
+
+Allocating matrices for forward and/or backward propagation.
 starting epoch 0 at record count 0, and file position 0
 already there from last epoch
-randomordering: 21 retries for 100 elements (21.0%) to ensure window condition
-randomordering: recached sequence for seed 0: 38, 46, ...
-Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.88667282 Perplexity = 2.427041
+RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 0: 38, 46, ...
+MBLayout::Init: Resizing m_distanceToStart from 1 x 0 to 100 x 1
+Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.31781933 Perplexity = 1.374128
 
 COMPLETED
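
Editor's note (a verification sketch appended for review, not part of the patch): two numbers in the updated baseline can be re-derived by hand. The Perplexity in "Final Results" appears to be exp() of the per-sample cross entropy, and the new "momentum as time constant" figure is consistent with -mbSize / ln(momentum) for the minibatch size of 10 used here (100 samples over 10 minibatches). Both formulas are assumptions of this note rather than taken from the patch; the constants are copied from the log above.

    // sanity-check sketch (C++); both formulas are assumed, not quoted from CNTK
    #include <cmath>
    #include <cstdio>

    int main()
    {
        const double cePerSample = 0.31781933; // ce: CrossEntropyWithSoftmax/Sample
        const double momentum    = 0.7;        // effective momentum per minibatch
        const double mbSize      = 10.0;       // 100 samples over 10 minibatches

        // Perplexity = exp(cross entropy per sample): prints 1.374128
        std::printf("perplexity    = %.6f\n", std::exp(cePerSample));
        // Momentum as time constant = -mbSize / ln(momentum): prints 28.0
        std::printf("time constant = %.1f samples\n", -mbSize / std::log(momentum));
        return 0;
    }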