From 397eac213fafd3aa507d422b37ea9627869f1589 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Wed, 16 Dec 2015 16:02:18 -0800 Subject: [PATCH 01/19] first experimental prototype of elementwise tensor op in PlusNode::ForwardProp() done except for actual kernel; new methods TensorShape::Pad() and Concat(); new method ComputationNode::GetTensorsForwardBinary(); moved ElementWiseOperator to CommonMatrix.h, using it in TensorView::DoSumOf(); TensorView::m_sob changed from ref to pointer to make the object copyable --- Source/CNTK/CNTK.cpp | 36 +++++------ Source/Common/Include/DataTensor.h | 46 +++++++++++--- .../ComputationNetworkLib/ComputationNode.cpp | 60 +++++++++++++++++++ .../ComputationNetworkLib/ComputationNode.h | 3 +- .../LinearAlgebraNodes.h | 6 ++ Source/Math/CommonMatrix.h | 25 ++++++++ Source/Math/GPUMatrix.h | 10 ---- Source/Math/TensorView.cpp | 14 +++-- Source/Math/TensorView.h | 13 ++-- 9 files changed, 162 insertions(+), 51 deletions(-) diff --git a/Source/CNTK/CNTK.cpp b/Source/CNTK/CNTK.cpp index 9e4bd127a..f2bd706bc 100644 --- a/Source/CNTK/CNTK.cpp +++ b/Source/CNTK/CNTK.cpp @@ -11,25 +11,8 @@ #define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _ #include "stdafx.h" -#include "Actions.h" -#include -#include -#include -#if defined(_WIN32) -#include "io.h" -#endif -#include "buildinfo.h" -#include "hostname.h" -#ifdef LEAKDETECT -#include "vld.h" // for memory leak detection -#endif -#include -#include -#include -#include -#include - #include "Basics.h" +#include "Actions.h" #include "ComputationNetwork.h" #include "ComputationNode.h" #include "DataReader.h" @@ -53,6 +36,23 @@ #include "BrainScriptEvaluator.h" #include "BrainScriptParser.h" +#include +#include +#include +#if defined(_WIN32) +#include "io.h" +#endif +#include "buildinfo.h" +#include "hostname.h" +#ifdef LEAKDETECT +#include "vld.h" // for memory leak detection +#endif +#include +#include +#include +#include +#include + #ifndef let #define let const auto #endif diff --git a/Source/Common/Include/DataTensor.h b/Source/Common/Include/DataTensor.h index 5423a43a0..5bfe4f410 100644 --- a/Source/Common/Include/DataTensor.h +++ b/Source/Common/Include/DataTensor.h @@ -107,24 +107,33 @@ namespace Microsoft { namespace MSR { namespace CNTK { void Invalidate() { m_dims.assign(3, SIZE_MAX); } // TODO: clean up the valid/invalid situation (this is currently done inconsistently). Also this object is immutable. - void Save(File& fstream) const + // verify that this refers to a dense matrix (no strides) + void VerifyIsDense() const { if (m_offset != 0) - LogicError("TensorShape::Save(): Cannot serialize TensorShape for slices."); + LogicError("TensorShape: A dense TensorShape expected. Offset %d not allowed.", (int)m_offset); + ptrdiff_t mul = 1; + for (size_t k = 0; k < m_dims.size(); k++) // (TODO: we can save one multiplication here) + { + if (m_steps[k] != mul) + LogicError("TensorShape: A dense TensorShape expected. Dimension %d is not.", (int)k); + mul *= (ptrdiff_t)m_dims[k]; + } + } + + void Save(File& fstream) const + { + VerifyIsDense(); // saving as 32-bit ints. 
This allows to continue to support the old format (size_t W, H, C) fstream << (uint32_t)m_dims.size(); - ptrdiff_t mul = 1; - for (size_t k = 0; k < m_dims.size(); k++) + for (auto dim : m_dims) { - auto dim = m_dims[k]; if (dim > UINT32_MAX) LogicError("TensorShape::Save(): Tensor dimensions %s out of bounds (> 4G).", string(*this).c_str()); fstream << (uint32_t)dim; - if (m_steps[k] != mul) - LogicError("TensorShape::Save(): Cannot serialize TensorShape for slices."); - mul *= (ptrdiff_t)dim; } } + void Load(File& fstream) { // format: uint32_t n, dim[0], dim[1], ..., dim[n-1] @@ -182,6 +191,27 @@ namespace Microsoft { namespace MSR { namespace CNTK { return m_steps[k] == m_steps[k - 1] * (ptrdiff_t)m_dims[k - 1]; } + // editing functions + // These all create new TensorShape objects. + TensorShape Pad(size_t numDims) const // append singleton dimensions + { + VerifyIsDense(); + if (numDims < GetNumDims()) + LogicError("PadDims: Cannot drop a shorten the dimensions."); + else if (numDims == GetNumDims()) + return *this; + auto dims = GetDims(); + dims.resize(numDims, 1); + return TensorShape(dims); + } + TensorShape Concat(const TensorShape & other) const // concatenate + { + auto dims = GetDims(); + auto otherDims = other.GetDims(); + dims.insert(dims.end(), otherDims.begin(), otherDims.end()); + return TensorShape(dims); + } + // pretty-printing. Returns tensor dims in the form "I x J x K". operator std::string() const { diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp index 5302e60c4..a104632f7 100644 --- a/Source/ComputationNetworkLib/ComputationNode.cpp +++ b/Source/ComputationNetworkLib/ComputationNode.cpp @@ -13,6 +13,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { + using namespace std; + // ----------------------------------------------------------------------- // subroutines for Validate() implementations // ----------------------------------------------------------------------- @@ -138,6 +140,61 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } + // ----------------------------------------------------------------------- + // tensor helpers + // ----------------------------------------------------------------------- + + template + static TensorShape GetSampleShape(const ComputationNode * node) + { + // TODO: use actual ImageLayout. While those are not yet inferred properly, maybe use it if its dims match numRows? + if (node->HasMBLayout()) // if we have a layout, that dimension is not part of the sample shape + return TensorShape(node->GetNumRows()); + else + return TensorShape(node->GetNumRows(), node->GetNumCols()); + } + + template + std::vector> ComputationNode::GetTensorsForwardBinary(const FrameRange & fr) + { + const size_t N = 3; // 2 inputs and 1 output + // BUGBUG: Currently does not interpret actual ImageLayouts or convolutional models. + // TODO: move this into a helper function + // get tensor shapes + vector*> nodes; + for (size_t i = 0; i < N; i++) + nodes.push_back(i < N-1 ? Input(i).get() : this); + vector> values; + vector shapes; + for (size_t i = 0; i < N; i++) + { + values.push_back(nodes[i]->ValueFor(i < N-1 ? 
fr.AllowBroadcast() : fr)); // no broadcasting for now allowed for output + shapes.push_back(GetSampleShape(nodes[i])); + } + // pad + size_t dims = 0; + for (size_t i = 0; i < N; i++) + if (dims < shapes[i].GetNumDims()) + dims = shapes[i].GetNumDims(); + for (size_t i = 0; i < N; i++) + shapes[i] = shapes[i].Pad(dims); + // concatenate MBLayout dims + // TODO: Is it possible that the output has no layout, but inputs have? Then we lost dimensions. Tensor constructor will catch that, though. + if (HasMBLayout()) + { + for (size_t i = 0; i < N; i++) + { + auto sm = nodes[i]->HasMBLayout() ? TensorShape(GetNumParallelSequences(), GetNumTimeSteps()) : TensorShape(1, 1); + shapes[i] = shapes[i].Concat(sm); + } + } + // perform operation + std::vector> tensors; + for (size_t i = 0; i < N; i++) + tensors.push_back(TensorView(values[i], shapes[i])); + return tensors; + } + // ----------------------------------------------------------------------- // others // ----------------------------------------------------------------------- @@ -172,6 +229,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { template<> std::map> ComputationNode::s_constOnes{}; template<> std::map> ComputationNode::s_constOnes{}; + template class ComputationNode; + template class ComputationNode; + template class LearnableParameter; template class LearnableParameter; }}} diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index f71c26fd3..3dc6a3f20 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -779,7 +779,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { protected: //std containers such as list and map does not support class reference so we need to use pointer typedef shared_ptr> ComputationNodePtr; - ComputationNode() { } public: using ComputationNodeBase::AttachInputs; // import the convenience functions that take 1..6 parameters using ComputationNodeBase::SetDims; @@ -1085,6 +1084,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { const Matrix& Gradient() const { return *m_gradient; } Matrix& Gradient() { return *m_gradient; } + std::vector> GetTensorsForwardBinary(const FrameRange & fr); + // Function to return the number of columns for whole batch or single frame size_t GetNumColsFor(const FrameRange & fr/*select frame or entire batch*/) { diff --git a/Source/ComputationNetworkLib/LinearAlgebraNodes.h b/Source/ComputationNetworkLib/LinearAlgebraNodes.h index b5624b003..79ad40f8e 100644 --- a/Source/ComputationNetworkLib/LinearAlgebraNodes.h +++ b/Source/ComputationNetworkLib/LinearAlgebraNodes.h @@ -20,6 +20,7 @@ #include "Basics.h" #include "Matrix.h" +#include "TensorView.h" #include "ComputationNode.h" #include "ConvolutionalNodes.h" @@ -129,6 +130,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override { +#if 0 // TODO: use #if 0 until this is working + auto args = GetTensorsForwardBinary(fr); + args[2].DoSumOf(0.0f, args[0], args[1], 1.0f); +#else Matrix functionValues = ValueForToDense(fr, false); // Switch to dense as a work-around because ColumnSlice doesn't support all the sparse formats Matrix inputFunctionValues0 = Input(0)->ValueFor(fr.AllowBroadcast()); Matrix inputFunctionValues1 = Input(1)->ValueFor(fr.AllowBroadcast()); @@ -185,6 +190,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } else LogicError("%ls %ls operation's Validate() function let invalid dimensions 
slip by.", NodeName().c_str(), OperationName().c_str()); +#endif #if DUMPOUTPUT functionValues.Print("PlusNode"); #endif diff --git a/Source/Math/CommonMatrix.h b/Source/Math/CommonMatrix.h index ceaa74d66..afd5d7d62 100644 --- a/Source/Math/CommonMatrix.h +++ b/Source/Math/CommonMatrix.h @@ -41,6 +41,28 @@ MATH_API DEVICEID_TYPE EnforceOneGPUOnly(DEVICEID_TYPE requestedDeviceId); namespace Microsoft { namespace MSR { namespace CNTK { + // ----------------------------------------------------------------------- + // ElementWiseOperator -- This enum represents which function to apply. + // This is shared between all matrix types and tensors. + // ----------------------------------------------------------------------- + + enum ElementWiseOperator + { + // binary + opSum, opDifference, opElementWiseProduct, opElementWiseQuotient, + opLogSum, opMax, opMin, + opEQ, opNE, opGT, opLT, opGE, opLE, + // unary (or binary with constant parameter) + opNegate, opNot, + opSaturate, opAbs, + opSigmoid, opSigmoidDerivative, opTanh, opSqrt, opExp, opLog, opLinearRectifierDerivative, opCosine, opNegativeSine + // Note: not all of the above are actually implement at present; and not all that's implemented has an opcode. + }; + + // ----------------------------------------------------------------------- + // various enums to describe + // ----------------------------------------------------------------------- + enum MatrixFlagBitPosition { bitPosRowMajor = 0, // row major matrix @@ -76,6 +98,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { matrixFlagSetValueOnDevice = 1< class BaseMatrix diff --git a/Source/Math/GPUMatrix.h b/Source/Math/GPUMatrix.h index 421959f1a..6b16d3b63 100644 --- a/Source/Math/GPUMatrix.h +++ b/Source/Math/GPUMatrix.h @@ -71,16 +71,6 @@ namespace Microsoft { }; - // ----------------------------------------------------------------------- - // ElementWiseOperator -- This enum represents which function to apply. It needs to be outside of GPUMatrix, because it is also used in GPUSparseMatrix - // ----------------------------------------------------------------------- - - enum ElementWiseOperator - { - opSigmoid = 0, opTanh, opSqrt, opExp, opLog, opAbs, opLinearRectifierDerivative, opCosine, opNegativeSine, opSigmoidDerivative - }; - - // ----------------------------------------------------------------------- // GPUMatrix // ----------------------------------------------------------------------- diff --git a/Source/Math/TensorView.cpp b/Source/Math/TensorView.cpp index bc5c75803..1c843c788 100644 --- a/Source/Math/TensorView.cpp +++ b/Source/Math/TensorView.cpp @@ -26,11 +26,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { // construction // ------------------------------------------------------------------- - // cast a matrix as a tensor + // cast a matrix as a TensorView template TensorView::TensorView(Matrix & sob) : - m_sob(sob), m_shape(TensorShape(array { sob.GetNumRows(), sob.GetNumCols() })) + m_sob(&sob), m_shape(TensorShape(array { sob.GetNumRows(), sob.GetNumCols() })) { } + // reshape a TensorView template TensorView::TensorView(const TensorView & other, const TensorShape & shape) : m_sob(other.m_sob), m_shape(shape) @@ -40,14 +41,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { // TODO: Use the multipliers instead? 
size_t i; size_t rowDim = 1; - for (i = 0; i < m_shape.size() && rowDim < m_sob.GetNumRows(); i++) + for (i = 0; i < m_shape.size() && rowDim < m_sob->GetNumRows(); i++) rowDim *= m_shape[i]; // first i dimensions match matrix row dimension size_t colDim = 1; for (; i < m_shape.size(); i++) colDim *= m_shape[i]; - if (rowDim != m_sob.GetNumRows() || colDim != m_sob.GetNumCols()) - LogicError("TensorView: Tensor dimensions %s do not match storage-object dims %d x %d", string(m_shape).c_str(), (int)m_sob.GetNumRows(), (int)m_sob.GetNumCols()); + if (rowDim != m_sob->GetNumRows() || colDim != m_sob->GetNumCols()) + LogicError("TensorView: Tensor dimensions %s do not match storage-object dims %d x %d", string(m_shape).c_str(), (int)m_sob->GetNumRows(), (int)m_sob->GetNumCols()); } // ------------------------------------------------------------------- @@ -57,7 +58,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { static bool Matches(size_t d1, size_t d2) { return d1 == 1 || d2 == 1 || d1 == d2; } // do two dimensions match? template - void TensorView::DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, int op/*will become an enum later*/) + void TensorView::DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op) { TensorView & c = *this; @@ -110,6 +111,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { as[k] *= as[k - 1]; as[k - 1] = 1; bs[k] *= bs[k - 1]; bs[k - 1] = 1; cs[k] *= cs[k - 1]; cs[k - 1] = 1; + os[k] *= os[k - 1]; os[k - 1] = 1; // BUGBUG: Must update multipliers as well } diff --git a/Source/Math/TensorView.h b/Source/Math/TensorView.h index a737e7746..1a8088a70 100644 --- a/Source/Math/TensorView.h +++ b/Source/Math/TensorView.h @@ -36,17 +36,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { { } // copy constructor TensorView(const TensorView & other) : - TensorView(other.m_sob, other.m_shape) + TensorView(*other.m_sob, other.m_shape) { } - // assignment is forbidden since we contain a reference - // If you ever need this, change the reference to a pointer. - void operator=(const TensorView & other) = delete; // since we have a reference // ------------------------------------------------------------------- // accessors // ------------------------------------------------------------------- - const Matrix & GetSOB() const { return m_sob; } + const Matrix & GetSOB() const { return *m_sob; } const TensorShape & GetShape() const { return m_shape; } // ------------------------------------------------------------------- @@ -59,19 +56,19 @@ namespace Microsoft { namespace MSR { namespace CNTK { // If beta == 0, c is not read out, i.e. it can be uninitialized or contain NaNs. 
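        // Usage sketch (assumed; mirrors Test() in TensorView.cpp -- note that at this point
        // in the patch series the kernel is still a stub that only prints the op):
        // computing c = a + b where the single column of a is broadcast across all columns of b:
        //
        //   Matrix<float> a(-1), b(-1), c(-1);       // -1 = CPU device
        //   a.Resize(13, 1); b.Resize(13, 42); c.Resize(13, 42);
        //   TensorView<float> ta(a), tb(b), tc(c);   // default shapes 13 x 1, 13 x 42, 13 x 42
        //   tc.DoSumOf(0.0f, ta, tb, 1.0f);          // beta = 0: c is overwritten, not accumulated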
// ------------------------------------------------------------------- - void DoSumOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha) { DoBinaryOpOf(beta, a, b, alpha, 0); } + void DoSumOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha) { DoBinaryOpOf(beta, a, b, alpha, ElementWiseOperator::opSum); } static void Test(); private: - void DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, int op/*will become an enum later*/); + void DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op); // ------------------------------------------------------------------- // sob members // ------------------------------------------------------------------- - Matrix & m_sob; // Storage OBject that holds the data that is being viewed with this TensorView + Matrix * m_sob; // Storage OBject that holds the data that is being viewed with this TensorView. Pointer instead of ref so this object is copyable. TensorShape m_shape; // the meta-data that describes the data's shape and/or access pattern // TODO: use a reference here or not? With a reference, we can hide more info in here such as cuDNN handles }; From cd6543e46d20e02cb8954babc2e3e2beca14b086 Mon Sep 17 00:00:00 2001 From: yzhang87 Date: Thu, 17 Dec 2015 13:55:41 -0500 Subject: [PATCH 02/19] A quick fix to the Kaldi Reader (sequence training, need more test!!) --- .../Kaldi2Reader/UtteranceDerivativeBuffer.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Source/Readers/Kaldi2Reader/UtteranceDerivativeBuffer.cpp b/Source/Readers/Kaldi2Reader/UtteranceDerivativeBuffer.cpp index 730a2b8d4..f845a4ca1 100644 --- a/Source/Readers/Kaldi2Reader/UtteranceDerivativeBuffer.cpp +++ b/Source/Readers/Kaldi2Reader/UtteranceDerivativeBuffer.cpp @@ -32,22 +32,26 @@ namespace Microsoft { namespace MSR { namespace CNTK { assert(pMBLayout->GetNumParallelSequences() == m_numUttsPerMinibatch); uttInfoInMinibatch->clear(); uttInfoInMinibatch->resize(uttInfo.size()); + for (size_t i = 0; i < uttInfo.size(); ++i) { size_t startFrameIndexInMinibatch = 0; size_t numFrames = 0; + for (size_t j = 0; j < pMBLayout->GetNumTimeSteps(); ++j) { - if (pMBLayout->Is(i, j, MinibatchPackingFlags::NoLabel)) + /* if (pMBLayout->Is(i, j, MinibatchPackingFlags::NoLabel)) { continue; - } - if (pMBLayout->Is(i, j, MinibatchPackingFlags::NoFeature)) + }*/ + FrameRange fr(pMBLayout,j); + + if (pMBLayout->IsGap(fr.Sequence(i))) { continue; } numFrames += 1; - if (pMBLayout->Is(i, j, MinibatchPackingFlags::SequenceEnd) + if (pMBLayout->IsBeyondStartOrEnd(fr.WithTimeOffset((ptrdiff_t) 1).Sequence(i)) || j == pMBLayout->GetNumTimeSteps() - 1) { size_t uttIndex = (*uttInfoInMinibatch)[i].size(); From bb6fc1bbe10c9a2db84b52d96b86551d7f0440cd Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Thu, 17 Dec 2015 11:33:52 -0800 Subject: [PATCH 03/19] optimized MBLayout::InitAsFrameMode(), short-replacing calls to AddSequence() by a much simpler direct initialization for this special case; added editing functions to TensorShape, and rewrote TensorView::DoBinaryOpOf() to use them --- Source/Common/Include/DataTensor.h | 140 ++++++++++++++---- Source/Common/Include/Sequences.h | 89 +++++++---- .../ComputationNetworkLib/ComputationNode.h | 2 +- Source/Math/Matrix.cpp | 1 + Source/Math/TensorView.cpp | 129 +++++++++------- Source/Math/TensorView.h | 4 +- 6 files changed, 243 insertions(+), 122 deletions(-) diff --git 
a/Source/Common/Include/DataTensor.h b/Source/Common/Include/DataTensor.h index 5bfe4f410..0152343d0 100644 --- a/Source/Common/Include/DataTensor.h +++ b/Source/Common/Include/DataTensor.h @@ -112,12 +112,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { { if (m_offset != 0) LogicError("TensorShape: A dense TensorShape expected. Offset %d not allowed.", (int)m_offset); - ptrdiff_t mul = 1; for (size_t k = 0; k < m_dims.size(); k++) // (TODO: we can save one multiplication here) { - if (m_steps[k] != mul) + ptrdiff_t stride = k > 0 ? m_strides[k - 1] * (ptrdiff_t)m_dims[k - 1] : 1; + if (m_strides[k] != stride) LogicError("TensorShape: A dense TensorShape expected. Dimension %d is not.", (int)k); - mul *= (ptrdiff_t)m_dims[k]; } } @@ -163,8 +162,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // accessors size_t GetDim(size_t k) const { return m_dims[k]; } size_t GetNumDims() const { return m_dims.size(); } - size_t GetNumElements() const { size_t res = 1; for (auto & dim : m_dims) res *= dim; return res; } - ptrdiff_t GetStep(size_t k) const { return m_steps[k]; } + size_t GetNumElements() const { size_t res = 1; for (auto & dim : m_dims) res *= dim; return res; } // in slice size_t GetOffset() const { return m_offset; } // vector-like accessors @@ -172,12 +170,31 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t size() const { return GetNumDims(); } const std::vector & GetDims() const { return m_dims; } // get all, e.g. for logging or for constructing derived tensors with edited dimensions + const std::vector & GetStrides() const { return m_strides; } // interpretation as an image tensor size_t GetNumChannels() const { return m_dims[0]; } size_t GetWidth() const { return m_dims[1]; } size_t GetHeight() const { return m_dims[2]; } + // indexing + // Determines the offset into the underlying element array for a given multi-dimensional index. + // This function is for reference. Probably not often used. + size_t Locate(const std::vector & index) const + { + ptrdiff_t location = m_offset; + for (size_t k = 0; k < index.size(); k++) + { + size_t dim = k < size() ? m_dims[k] : 1; // dimensions are bottomless + if (index[k] >= dim) + LogicError("Locate: Tensor index[%d]=%d exceeds bound %d.", (int)k, (int)index[k], (int)dim); + location += (ptrdiff_t)index[k] * m_strides[k]; // strides may be negative + } + if (location < 0 || (size_t)location >= m_allocation) + LogicError("Locate: Tensor index out of bounds."); + return (size_t)location; + } + // helpers for tensor operations bool CanFlatten(size_t k) const // can dims k and k-1 be flattened into a single vector? (do they form a matrix without stride) { @@ -188,16 +205,71 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (m_dims[k] == 1 || m_dims[k - 1] == 1) // both are broadcasting or scalar--we don't care about stride in this case return true; else - return m_steps[k] == m_steps[k - 1] * (ptrdiff_t)m_dims[k - 1]; + return m_strides[k] == m_strides[k - 1] * (ptrdiff_t)m_dims[k - 1]; } - // editing functions // These all create new TensorShape objects. + TensorShape Flatten(size_t k) const // flatten [k] with [k-1] + { + TensorShape result = *this; + if (!CanFlatten(k)) + LogicError("Flatten() cannot flatten dimensions with gaps"); + // We reshape local (I x J) sub-matrices to (1 x I*J) sub-matrices. + // We merge to right so that we can merge multiple by looping left-to-right. 
+ // m_dims = I J K L + // m_strides = 1 I I*J I*J*K + // flattening J and K + // m_dims = I 1 J*K L + // m_strides = 1 I I I*J*K + // TODO: rethink whether this is correct for example of negative strides + result.m_dims[k] *= result.m_dims[k - 1]; + result.m_dims[k - 1] = 1; + result.m_strides[k] = /*result.m_dims[k - 1] *, it's 1 */ result.m_strides[k - 1]; + return result; + } + TensorShape DropSingletonDims(const std::vector & toDrop) const // flatten [k] with [k-1] if toFlatten[k] is set + { + TensorShape result = *this; + size_t j = 0; + for (size_t k = 0; k < size(); k++) + { + if (toDrop[k]) + { + if (result.m_dims[k] != 1) + LogicError("DeropSingletonDims() cannot drop non-singleton dimensions."); + else + continue; + } + else + { + // example + // m_dims = I 1 J K + // m_strides = 1 I I I*J + // dropping the second dimension + // m_dims = I % J K + // m_strides = 1 % I I*J + result.m_dims[j] = result.m_dims[k]; + result.m_strides[j] = result.m_strides[k]; + j++; + } + } + result.m_dims.resize(j); + result.m_strides.resize(j); + return result; + } + TensorShape WithBroadcastStrides() const // flatten [k] with [k-1] if toFlatten[k] is set + { + TensorShape result = *this; + for (size_t k = 0; k < size(); k++) + if (result.m_dims[k] == 1) + result.m_strides[k] = 0; + return result; + } TensorShape Pad(size_t numDims) const // append singleton dimensions { VerifyIsDense(); if (numDims < GetNumDims()) - LogicError("PadDims: Cannot drop a shorten the dimensions."); + LogicError("Pad() cannot drop a shorten the dimensions."); else if (numDims == GetNumDims()) return *this; auto dims = GetDims(); @@ -216,59 +288,65 @@ namespace Microsoft { namespace MSR { namespace CNTK { operator std::string() const { std::string s; - for (const auto & dim : m_dims) + for (size_t k = 0; k < size(); k++) { if (!s.empty()) s.append(" x "); - s.append(std::to_string(dim)); + s.append(std::to_string(m_dims[k])); } +#ifdef _DEBUG // also emit the strides, easier for debugging + s.append(" {"); + for (size_t k = 0; k < size(); k++) + { + if (k > 0) + s.append(","); + s.append(std::to_string(m_strides[k])); + } + s.append("}"); +#endif return s; } private: - // reset m_steps and m_offset to represent a canonical no-strides tensor + // reset m_strides and m_offset to represent a canonical no-strides tensor void InitAsNoSlice() { m_offset = 0; - m_steps.resize(m_dims.size()); - ptrdiff_t mul = 1; + m_strides.resize(m_dims.size()); for (size_t k = 0; k < m_dims.size(); k++) - { - m_steps[k] = (ptrdiff_t)mul; - mul *= m_dims[k]; - } + m_strides[k] = k > 0 ? m_strides[k - 1] * (ptrdiff_t)m_dims[k - 1] : 1; + m_allocation = m_dims.empty() ? 0 : m_dims.back() * (size_t)m_strides.back(); } private: std::vector m_dims; // dimensions of tensor or tensor slice. The size of the box. - std::vector m_steps; // dimension gets multiplied by this for computing the index offset. How to hop to the next element in dimension[k]. Stride magic happening here! + std::vector m_strides; // dimension gets multiplied by this for computing the index offset. How to hop to the next element in dimension[k]. Stride magic happening here! size_t m_offset; // offset to element(0,0,...,0). May be non-0 in case of slicing. - // For a regular tensor, there are no strides, m_steps[k] = m_steps[k-1] * m_dims[k-1]. This is how TensorShapes are created from dimensions. + size_t m_allocation; // allocation size of original dense tensor + // For a regular tensor, there are no strides, m_strides[k] = m_strides[k-1] * m_dims[k-1]. 
This is how TensorShapes are created from dimensions. // For views into existing tensors, we do stride shenanigans to implement broadcasting (plus magic tricks). Examples: // To traverse a 5 x 10 matrix with column order reversed: // - op.dims = (5 x 10) // - m_offset points to element (0,9) - // - m_steps[0] = 1 // regular forward iteration within each column - // - m_steps[1] = -5 // backward iteration over columns + // - m_strides = (1, -5) // backward iteration over columns // To compute matrix C(13 x 42) = vector A(13 x 1) + matrix B(13 x 42): // - op = sum // - op.dims = (13 x 42) - // - *.m_steps[0] = 1 // forward iteration through each column - // - C.m_steps[1] = 13 // forward iteration over columns of B--defines the for loop - // - B.m_steps[1] = 13 // forward iteration over columns of B--iterates in sync with C - // - A.m_steps[1] = 0 // A, however, is stuck in column 0 forever + // - C.m_strides = (1, 13) // forward iteration over columns of B--defines the for loop + // - B.m_strides = (1, 13) // forward iteration over columns of B--iterates in sync with C + // - A.m_strides = (1, 0) // A, however, is stuck in column 0 forever // Matrix product: C(I x K) = A(I x J) * B(J x K) --Note: Likely not RAM-bandwidth efficient! // - op = mul // - op.dims = (I x J x K) // iteration dimensions - // - C.m_steps = (1, 0, I) // inverse broadcasting for inner dimension - // - A.m_steps = (1, I, 0) - // - B.m_steps = (0, 1, J) + // - C.m_strides = (1, 0, I) // inverse broadcasting for inner dimension + // - A.m_strides = (1, I, 0) + // - B.m_strides = (0, 1, J) // Convolution of time signals (without padding): Y(T-N+1) = X(T) * H(N): --Note: Likely not RAM-bandwidth efficient! // - op = mul // - op.dims = (T-N+1 x N) // iteration dimensions - // - Y.m_steps = (1, 0) // inverse broadcasting: this sums up the individual products - // - X.m_steps = (1, 1) // shift window by 1 for each output sample - // - H.m_steps = (0, -1) // reuse for each output sample; iterate in reverse order for convolution + // - Y.m_strides = (1, 0) // inverse broadcasting: this sums up the individual products + // - X.m_strides = (1, 1) // shift window by 1 for each output sample + // - H.m_strides = (0, -1) // reuse for each output sample; iterate in reverse order for convolution // - H.m_offset = N - 1 // begin with last element (reverse order for convolution) // TODO: double-check all these // TODO: Does the same trick work for 2D images? diff --git a/Source/Common/Include/Sequences.h b/Source/Common/Include/Sequences.h index 95f9a8e38..15484458f 100644 --- a/Source/Common/Include/Sequences.h +++ b/Source/Common/Include/Sequences.h @@ -108,12 +108,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_numParallelSequences = numParallelSequences; m_numTimeSteps = numTimeSteps; // allocate lookup tables (note: except at the start, these don't really allocate new memory most of the time) - // PTRDIFF_MAX indicates not initialized (also in the matrix, which is stored as float). 
- m_distanceToStart.Resize(m_numParallelSequences, m_numTimeSteps); m_distanceToStart.SetValue((float)PTRDIFF_MAX); - m_distanceToEnd.Resize(m_numParallelSequences, m_numTimeSteps); m_distanceToEnd.SetValue((float)PTRDIFF_MAX); - m_distanceToNearestStart.assign(m_numTimeSteps, PTRDIFF_MAX); - m_distanceToNearestEnd.assign(m_numTimeSteps, PTRDIFF_MAX); +#if 1 + if (m_distanceToStart.GetNumRows() != m_numParallelSequences || m_distanceToStart.GetNumCols() != m_numTimeSteps) // sanity check for debugging a regression + fprintf(stderr, "MBLayout::Init: Resizing m_distanceToStart from %d x %d to %d x %d\n", + (int)m_distanceToStart.GetNumRows(), (int)m_distanceToStart.GetNumCols(), (int)m_numParallelSequences, (int)m_numTimeSteps); // (I really want to know about actual allocations, but this is a necessary condition for them) +#endif + m_distanceToStart.Resize(m_numParallelSequences, m_numTimeSteps); + m_distanceToEnd.Resize(m_numParallelSequences, m_numTimeSteps); + m_distanceToNearestStart.assign(m_numTimeSteps, SIZE_MAX); + m_distanceToNearestEnd.assign(m_numTimeSteps, SIZE_MAX); m_timeStepHasGap.assign(m_numTimeSteps, false); + m_columnsValidityMask.Resize(0, 0); // invalidate // reset state m_numFramesDeclared = 0; m_numGapFrames = 0; @@ -121,20 +126,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_writable = true; } - // short-hand to initialize an MBLayout for the common case of frame mode - // In frame mode, there is one parallel "sequence" per sample, which is 1 frame long. - void InitAsFrameMode(size_t numSamples) - { - Init(numSamples, 1); - SequenceInfo seqInfo { 0, 0, 0, 1 }; - for (size_t s = 0; s < numSamples; s++) - { - seqInfo.seqId = seqInfo.s = s; - AddSequence(seqInfo); - } - Lock(); - } - // ------------------------------------------------------------------- // accessors // ------------------------------------------------------------------- @@ -199,7 +190,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { LogicError("AddSequence: Sequence added to an MBLayout must overlap with minibatch."); // remember it +#if 1 + auto cap = m_sequences.capacity(); // some sanity check for debugging a speed regression m_sequences.push_back(seqDesc); + if (cap != m_sequences.capacity()) + fprintf(stderr, "AddSequence: m_sequences was reallocated from capacity %d to %d\n", (int)cap, (int)m_sequences.capacity()); +#else + m_sequences.push_back(seqDesc); +#endif // create all the cached fast-lookup information const auto seqId = seqDesc.seqId; @@ -212,7 +210,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_numGapFrames += (e - b); for (size_t t = b; t < e; t++) { - //Set(s, t, MinibatchPackingFlags::NoInput); m_timeStepHasGap[t] = true; m_distanceToStart(s, t) = -1; // start flags also encode gaps } @@ -220,22 +217,49 @@ namespace Microsoft { namespace MSR { namespace CNTK { else for (size_t t = b; t < e; t++) { // update the nearest sentence boundaries, minimum over all parallel sequences - // -1 in distanceToStart(,) stands for a gap - assert(m_distanceToStart(s, t) != -1); // gaps not allowed to overlap // If 0, then we are on a boundary. If not 0, we can still test in presence of FrameRange.m_timeOffset. 
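                    // Worked example (illustration only): a sequence with beginTime = 2, endTime = 6
                    // in parallel stream s yields, for t = 2..5,
                    //   m_distanceToStart(s, t) = 0, 1, 2, 3   and   m_distanceToEnd(s, t) = 3, 2, 1, 0;
                    // a FrameRange with m_timeOffset = -1 is then beyond the start exactly where
                    // distanceToStart < 1, which is the test IsBeyondStartOrEnd() performs.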
- ptrdiff_t distanceToStart = t - beginTime; - if (m_distanceToStart(s, t) > (float)distanceToStart) - m_distanceToStart(s, t) = (float)distanceToStart; + size_t distanceToStart = (size_t)((ptrdiff_t)t - beginTime); + size_t distanceToEnd = endTime - 1 - t; + m_distanceToStart(s, t) = (float)distanceToStart; + m_distanceToEnd(s, t) = (float)distanceToEnd; + // and the aggregate if (m_distanceToNearestStart[t] > distanceToStart) m_distanceToNearestStart[t] = distanceToStart; - ptrdiff_t distanceToEnd = endTime - 1 - t; - if (m_distanceToEnd(s, t) > (float) distanceToEnd) - m_distanceToEnd(s, t) = (float) distanceToEnd; if (m_distanceToNearestEnd[t] > distanceToEnd) m_distanceToNearestEnd[t] = distanceToEnd; } } + // short-hand to initialize an MBLayout for the common case of frame mode + // In frame mode, there is one parallel "sequence" per sample, which is 1 frame long. + // This function provides an efficient short-cut implementation of AddSequence(t, t, 0, 1) for every sample t. + void InitAsFrameMode(size_t numSamples) + { + Init(numSamples, 1); + + // create sequences array + SequenceInfo virginSeqInfo = { 0, 0, 0, 1 }; + m_sequences.resize(numSamples, virginSeqInfo); // pass it here since otherwise STL will initialize everything to 0 unnecessarily + + // update sequence indices + for (size_t s = 0; s < numSamples; s++) + { + // remember it + auto & seqDesc = m_sequences[s]; + seqDesc.seqId = s; + seqDesc.s = s; + } + m_numFramesDeclared = numSamples; + + // create all the cached fast-lookup information + m_distanceToStart.SetValue(0); + m_distanceToEnd.SetValue(0); + m_distanceToNearestStart[0] = 0; + m_distanceToNearestEnd[0] = 0; + + Lock(); + } + // mark a range of frames in a parallel sequence as invalid // I'd love to start with all-gaps, but that would require to set flags upfront, and then clearing them. void AddGap(size_t s, ptrdiff_t beginTime, size_t endTime) { if ((ptrdiff_t)endTime > beginTime) AddSequence(GAP_SEQUENCE_ID, s, beginTime, endTime); } @@ -330,10 +354,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { // 2 1 0 . . ] // (last two time steps undefined) // m_distanceToNearestStart = [ 0 1 2 3 4 ] // m_distanceToNearestEnd = [ 2 1 0 1 0 ] - Matrix m_distanceToStart, m_distanceToEnd; // (s,t); value<0 stands for gap, PTRDIFF_MAX for 'not initialized' - vector m_distanceToNearestStart, m_distanceToNearestEnd; // [t] (value<0 does NOT stand for gap; consult m_timeStepHasGap[] vector instead) + Matrix m_distanceToStart, m_distanceToEnd; // (s,t); value<0 stands for gap + vector m_distanceToNearestStart, m_distanceToNearestEnd; // [t] (does not store info about gaps; consult m_timeStepHasGap[] vector instead) - vector m_timeStepHasGap; // [t] + vector m_timeStepHasGap; // [t] true if at least one gap in time step t // Cached mask indicating the validity of each column in the MBLayout // TODO: We actually just need a boolean matrix for this. @@ -527,6 +551,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (s == SIZE_MAX) // aggregate requested { // determine flags from aggregate vectors + assert(m_distanceToNearestStart[t] != SIZE_MAX); // (sanity check) auto distanceToStart = (ptrdiff_t)m_distanceToNearestStart[t]; if (distanceToStart < -fr.m_timeOffset) return true; @@ -557,7 +582,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // TODO: Remove this version (with sanity checks) after this has been tested. Then the function can be inlined above. 
inline size_t MBLayout::GetActualNumSamples() const { -#if 1 // sanity check --TODO: delete this after a while +#if 0 // sanity check --TODO: delete this after a while size_t n = GetNumCols(); if (HasGaps()) { diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 3dc6a3f20..7f89060a6 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -1520,7 +1520,7 @@ protected: \ using Base::CreateUniqId; \ using Base::GetNumInputs; using Base::ZeroGradientsOfInputs; using Base::VerifyDims; \ using Base::ConstOnes; \ - using Base::GetImageLayout; using Base::InferImageDimsFromInput; using Base::InferImageDimsFromInputs; using Base::InferMBLayoutFromInputsForStandardCase; \ + using Base::GetImageLayout; using Base::GetTensorsForwardBinary; using Base::InferImageDimsFromInput; using Base::InferImageDimsFromInputs; using Base::InferMBLayoutFromInputsForStandardCase; \ using Base::CopyTo; using Base::CreateUniqNodeName; using Base::DetachInputs; using Base::GetInputsFromConfig; \ using Base::DumpNodeInfo; using Base::EnumerateNodes; \ using Base::HasMBLayout; using Base::GetMBLayout; using Base::LinkToMBLayout; \ diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index 7f644687e..0bd30e22b 100644 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -5205,5 +5205,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { template void Matrix::SetValue(const char); template void Matrix::SetValue(size_t numRows, const size_t numCols, int deviceId, char *pArray, size_t matrixFlags); template bool Matrix::IsEmpty() const; + template void Matrix::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, bool growOnly); }}} diff --git a/Source/Math/TensorView.cpp b/Source/Math/TensorView.cpp index 1c843c788..21fab4559 100644 --- a/Source/Math/TensorView.cpp +++ b/Source/Math/TensorView.cpp @@ -60,92 +60,109 @@ namespace Microsoft { namespace MSR { namespace CNTK { template void TensorView::DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op) { +#define N 3 // later make this a template parameter. N=1 is possible for generators, such as constants. + array shapes; TensorView & c = *this; - // TODO: Turn the inner meat here into a function template using a std::array<., N-nariness>. Nullary ops are generators, e.g. constants. + shapes[0] = a.GetShape(); + shapes[1] = b.GetShape(); + shapes[2] = c.GetShape(); // last one is the output // massage TensorShapes // Note that TensorShapes here may be shapes are stored or shapes with stride magic applied. - auto as = a.GetShape().GetDims(); - auto bs = b.GetShape().GetDims(); - auto cs = c.GetShape().GetDims(); // expand ones to make tensors compatible // Trailing dimensions broadcast. // E.g. A(J) vs. B(J x T) will broadcast A(:) to all T columns. // To broadcast an A(T) to all J rows of B, use TensorShape editing to insert a dimension to get A(1,T). 
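        // Example (illustration only): adding a per-row bias a(13) to a minibatch b(13 x 42):
        // after Pad(2) the shapes are a = 13 x 1, b = c = 13 x 42, and opDims = 13 x 42;
        // WithBroadcastStrides() later turns a's strides (1, 13) into (1, 0), so the same
        // 13 values of a are re-read for every column of b.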
- auto dims = max(max(as.size(), bs.size()), cs.size()); - as.resize(dims, 1); - bs.resize(dims, 1); - cs.resize(dims, 1); + size_t dims = 0; + for (size_t i = 0; i < N; i++) + if (dims < shapes[i].GetNumDims()) + dims = shapes[i].GetNumDims(); + for (size_t i = 0; i < N; i++) + shapes[i] = shapes[i].Pad(dims); // determine operation shape (max over all dimensions) - decltype(as) os(dims); + vector opDims(dims, 0); for (size_t k = 0; k < dims; k++) - os[k] = max(max(as[k], bs[k]), cs[k]); + for (size_t i = 0; i < N; i++) + opDims[k] = max(opDims[k], shapes[i][k]); // dimension compatibility check // Each participant can broadcast. Non-broadcasting dimensions must match the operation dimension. for (size_t k = 0; k < dims; k++) - { - if (!Matches(as[k], os[k]) || !Matches(bs[k], os[k]) || !Matches(cs[k], os[k])) - InvalidArgument("Binary tensor operation: Dimension %d is incompatible between the two inputs and output (%d vs. %d vs. %d)", (int)dims, (int)as[k], (int)bs[k], (int)cs[k]); - } + for (size_t i = 0; i < N; i++) + if (!Matches(shapes[i][k], opDims[k])) + InvalidArgument("Binary tensor operation: Dimension %d is incompatible between input %d and output (%s vs. %s)", (int)k, (int)shapes[i][k], string(shapes[i]).c_str(), string(TensorShape(opDims)).c_str()); // flatten consecutive dimensions // Dimensions must be consecutive in memory, and either non-broadcasting or all-broadcasting, across all dimensions. // After this, as, bs, and cs no longer match the TensorShape objects. + fprintf(stderr, "Pre-flatten: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); for (size_t k = 1; k < dims; k++) { - // check if stored without gaps to skip - if (!a.GetShape().CanFlatten(k) || !b.GetShape().CanFlatten(k) || !c.GetShape().CanFlatten(k)) - continue; - // check if they are either all broadcasting or all not broadcasting - if ((as[k] != os[k] || as[k - 1] != os[k - 1]) && (as[k] != 1 || as[k - 1] != 1)) - continue; - if ((bs[k] != os[k] || bs[k - 1] != os[k - 1]) && (bs[k] != 1 || bs[k - 1] != 1)) - continue; - if ((cs[k] != os[k] || cs[k - 1] != os[k - 1]) && (cs[k] != 1 || cs[k - 1] != 1)) - continue; - // merge the dimensions - as[k] *= as[k - 1]; as[k - 1] = 1; - bs[k] *= bs[k - 1]; bs[k - 1] = 1; - cs[k] *= cs[k - 1]; cs[k - 1] = 1; - os[k] *= os[k - 1]; os[k - 1] = 1; - // BUGBUG: Must update multipliers as well + for (size_t i = 0; i < N; i++) + { + // check if stored without gaps to skip + if (!shapes[i].CanFlatten(k)) + goto nope; + // check if they are either all broadcasting or all not broadcasting + if ((shapes[i][k] != opDims[k] || shapes[i][k - 1] != opDims[k - 1]) && (shapes[i][k] != 1 || shapes[i][k - 1] != 1)) + goto nope; + } + // these dimensions can be merged + for (size_t i = 0; i < N; i++) + shapes[i] = shapes[i].Flatten(k); // TODO: overdoing the immutable thingy much? 
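                // note: the iteration dimensions must be flattened in lock-step with the
                // operand shapes, or the per-dimension strides would no longer line up with opDims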
+ opDims = TensorShape(opDims).Flatten(k).GetDims(); // (ugh) + nope:; } + fprintf(stderr, "Post-flatten: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); // remove singleton dimensions - size_t j = 0; + vector toDrop(dims, false); for (size_t k = 0; k < dims; k++) { - if (as[k] == 1 && bs[k] == 1 && cs[k] == 1) // skip all-singleton dimensions - continue; - as[j] = as[k]; - bs[j] = bs[k]; - cs[j] = cs[k]; - os[j] = os[k]; - j++; + for (size_t i = 0; i < N; i++) + if (shapes[i][k] != 1) + goto neither; + toDrop[k] = true; // found an all-singleton dimensions + neither:; } - // note: if op is a scalar, then we end up with 0 dimensions here - dims = j; - as.resize(dims); - bs.resize(dims); - cs.resize(dims); - os.resize(dims); - let as1 = TensorShape(as); // BUGBUG: We just lost stride info. - let bs1 = TensorShape(bs); - let cs1 = TensorShape(cs); - let os1 = TensorShape(os); + for (size_t i = 0; i < N; i++) + shapes[i] = shapes[i].DropSingletonDims(toDrop); + opDims = TensorShape(opDims).DropSingletonDims(toDrop).GetDims(); // (ugh) + // note: if op is a scalar, then we end up with 0 dimensions here, which is allowed + fprintf(stderr, "Post-drop: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); + + // determine broadcasting; that is, set strides to 0 for 1-dimensions + // To be more precise, we should only set actually broadcasting dimensions to 0. + // But since dimensions that are 1 across all args are eliminated, any 1 must be some form of broadcasting. + // TODO: Do we need to allow other strides at this point in time? If not, broadcasting becomes a bit vector. + for (size_t i = 0; i < N; i++) + shapes[i] = shapes[i].WithBroadcastStrides(); // determine inverse broadcasting dimensions - // TODO: describe the resulting for loop as a set of tensor dims and strides as well. - vector cBroadcasts(dims); - for (size_t k = 0; k < dims; k++) - cBroadcasts[k] = cs1[k] == 1 && (as1[k] != 1 || bs1[k] != 1); + // Inverse broadcasting dims are actual for loops in the kernel, whereas broadcasting input dims are handled by the thread index. 
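        // (Example of inverse broadcasting: reducing c(13 x 1) over the columns of a(13 x 42):
        // c's column stride becomes 0, so the kernel must loop over the 42 columns and
        // accumulate, rather than map that dimension onto the thread grid.)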
+ // For regular input dims: + // - determine number of steps (product over opDims[.]) + // - launch that many kernels + // - pass in: + // - total number of steps + // - strides for all inputs (with stride magic), separated by regular and inverse broadcasting dimensions + // - opDim (no stride magic allowed) for regular broadcasting dimensions + // - reverse broadcasting dimensions + // - opcodes for elementwise op and reduction op + // - in each kernel: + // - map thread index to dimensions (regular broadcasting ones) + // - for-loop over inverse broadcasting dimensions + // - map dimensions (including inverse broadcasting) for every input + // - perform op on the input values + // - accumulate + // - map dimensions (regular) for output + // - save result // now perform the operation - fprintf(stderr, "Op %d: %s op %s -> %s via %s\n", (int)op, string(as1).c_str(), string(bs1).c_str(), string(cs1).c_str(), string(os1).c_str()); + fprintf(stderr, "Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); // :) beta; alpha; } @@ -155,9 +172,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { template /*static*/ void TensorView::Test() { - Matrix m1(0); m1.Resize(1, 42); - Matrix m2(0); m2.Resize(13, 1); - Matrix m3(0); m3.Resize(13, 21); + Matrix m1(-1); m1.Resize(1, 42); + Matrix m2(-1); m2.Resize(13, 1); + Matrix m3(-1); m3.Resize(13, 21); TensorShape s1(1, 2, 21); TensorShape s2(13, 1); TensorShape s3(13, 1, 21); diff --git a/Source/Math/TensorView.h b/Source/Math/TensorView.h index 1a8088a70..be037fa5b 100644 --- a/Source/Math/TensorView.h +++ b/Source/Math/TensorView.h @@ -68,8 +68,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // sob members // ------------------------------------------------------------------- - Matrix * m_sob; // Storage OBject that holds the data that is being viewed with this TensorView. Pointer instead of ref so this object is copyable. - TensorShape m_shape; // the meta-data that describes the data's shape and/or access pattern + Matrix * m_sob; // Storage OBject that holds the data that is being viewed with this TensorView. Pointer instead of ref so this object is copyable. + TensorShape m_shape; // the meta-data that describes the data's shape and/or access pattern // TODO: use a reference here or not? 
With a reference, we can hide more info in here such as cuDNN handles }; From e6040d050dbbe11b937e2222eeea4d335a562731 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Thu, 17 Dec 2015 11:35:28 -0800 Subject: [PATCH 04/19] made Linux build happy (missing explicit method template specialization of CPUMatrix::Resize()) --- Source/Math/CPUMatrix.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index ba1bc077d..937f642c5 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -5551,5 +5551,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { template void CPUMatrix::SetValue(const char); template void CPUMatrix::SetValue(const size_t numRows, const size_t numCols, char *pArray, size_t matrixFlags); template void CPUMatrix::SetValue(CPUMatrix const&); + template void CPUMatrix::Resize(const size_t numRows, const size_t numCols, bool growOnly); }}} From aa5d1a7213880b4b00eafc1bb09002fbfdc4b08b Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Thu, 17 Dec 2015 15:50:00 -0800 Subject: [PATCH 05/19] implemented plumbing and first shot for TensorView operation with reduction --- Source/Math/CPUMatrix.cpp | 137 +++++++++++++++++ Source/Math/CPUMatrix.h | 5 + Source/Math/Matrix.cpp | 301 +++++++++++++++++++------------------ Source/Math/Matrix.h | 7 +- Source/Math/TensorView.cpp | 35 ++++- Source/Math/TensorView.h | 2 +- 6 files changed, 334 insertions(+), 153 deletions(-) diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index 937f642c5..092cd8c9e 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -5533,6 +5533,143 @@ namespace Microsoft { namespace MSR { namespace CNTK { return numThreads; } + // ----------------------------------------------------------------------- + // TensorView support + // ----------------------------------------------------------------------- + + // perform loop over reduction index m + // This function is declared inside a wrapper struct to allow partial specialization (m = -1). + template + struct TensorOpReduction + { + // reduction case (non-reduction case is specialized) + static inline ElemType Loop(array pointers, const OPFN & opfn, + const std::vector & reducingOpDims, const std::array, N> & reducingStrides) + { + array strides; + for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled + strides[i] = reducingStrides[i][(size_t)m]; + ElemType aggregate = 0; + for (size_t dim = reducingOpDims[(size_t)m]; dim-- > 0;) + { + // need to descend into one loop deeper + aggregate += TensorOpReduction::Loop(pointers, opfn, reducingOpDims, reducingStrides); + // advance the pointers + for (size_t i = 0; i < N; i++) + pointers[i] += strides[i]; + } + return aggregate; + } + }; + + // perform loop over reduction index m + // This is the specialized version for m = -1, which terminates the recursion. + template + struct TensorOpReduction + { + static inline ElemType Loop(array pointers, const OPFN & opfn, + const std::vector &, const std::array, N> &) + { + return opfn(pointers); // finally we are doing some work!!! 
+ } + }; + + // perform loop over regular index k and reducing index m for N operands (counting the output) + template + struct TensorOpIteration + { + static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, + const std::vector & regularOpDims, const std::array, N> & regularStrides, + const std::vector & reducingOpDims, const std::array, N> & reducingStrides) + { + // non-scalar case: still nested result loops left + array strides; + for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled + strides[i] = regularStrides[i][(size_t)k]; + for (size_t dim = regularOpDims[(size_t)k]; dim--> 0;) + { + // need to descend into one loop deeper + TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + // advance the pointers + for (size_t i = 0; i < N; i++) + pointers[i] += strides[i]; + } + } + }; + + template + struct TensorOpIteration + { + static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, + const std::vector &, const std::array, N> &, + const std::vector & reducingOpDims, const std::array, N> & reducingStrides) + { + // we are at element level for the result: perform the op (there may still be reduction) + ElemType val = alpha * TensorOpReduction::Loop(pointers, opfn, reducingOpDims, reducingStrides); + // combine with previous value in target matrix, then write it out + auto * pout = pointers.back(); + if (beta != 0) + val += beta * *pout; + *pout = val; + return; + } + }; + + // tensor operation with k+1 dimensions (-1 means scalar) + template + static inline void TensorOpWithRegularLoop(ElemType beta, const array & pointers, ElemType alpha, const OPFN & opfn, + const std::vector & regularOpDims, const std::array, N> & regularStrides, + const std::vector & reducingOpDims, const std::array, N> & reducingStrides) + { + size_t dims = regularOpDims.size(); + switch (dims) + { + case 2: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 1: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 0: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + default: LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (int)dims); + } + } + + // tensor operation, generalized in number of arguments, operation already provided as a lambda + // This function now expands into different k. 
+ template + static inline void TensorOpWithFn(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, + const std::array & offsets, + const std::vector & regularOpDims, const std::array, N> & regularStrides, + const std::vector & reducingOpDims, const std::array, N> & reducingStrides) + { + for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled + pointers[i] += offsets[i]; + size_t dims = regularOpDims.size(); + switch (dims) + { + case 4: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 3: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 2: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 1: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 0: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + default: LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)dims); + } + } + + // perform binary operation 'op' on a and b giving c, reinterpreting the matrices as tensors as specified by the dims and strides + template + void CPUMatrix::TensorOp(ElemType beta, const CPUMatrix& a, const CPUMatrix& b, ElemType alpha, ElementWiseOperator op, + const std::array & offsets, + const std::vector & regularOpDims, const std::array, 3> & regularStrides, + const std::vector & reducingOpDims, const std::array, 3> & reducingStrides) + { + array pointers = { a.m_pArray, b.m_pArray, m_pArray }; + switch (op) + { + case ElementWiseOperator::opSum: + return TensorOpWithFn(beta, pointers, alpha, [](const array & pp) { return *(pp[0]) + *(pp[1]); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + default: + LogicError("TensorNnaryOp: Unknown op code %d.", (int)op); + } + } + // The explicit instantiation part template class MATH_API CPUMatrix; template class MATH_API CPUMatrix; diff --git a/Source/Math/CPUMatrix.h b/Source/Math/CPUMatrix.h index 83d63559b..6128204c4 100644 --- a/Source/Math/CPUMatrix.h +++ b/Source/Math/CPUMatrix.h @@ -334,6 +334,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { static bool AreEqual(const CPUMatrix& a, const CPUMatrix& b, const ElemType threshold = 1e-8); static void TensorShuffleScaleAndAdd(ElemType keepWeight, const CPUMatrix& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const CPUMatrix& b, CPUMatrix& c); + + void TensorOp(ElemType beta, const CPUMatrix& a, const CPUMatrix& b, ElemType alpha, ElementWiseOperator op, + const std::array & offsets, + const std::vector & regularOpDims, const std::array, 3> & regularStrides, + const std::vector & reducingOpDims, const std::array, 3> & reducingStrides); static CPUMatrix Ones(const size_t rows, const size_t cols); static CPUMatrix Zeros(const size_t rows, const size_t cols); diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index 0bd30e22b..d49caee4e 100644 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -4794,7 +4794,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } - + template bool Matrix::HasElement(const Matrix& a, const ElemType value) { @@ -4936,148 +4936,144 @@ namespace Microsoft { namespace MSR { namespace CNTK { return *this; } - template - Matrix& 
Matrix::AssignElementProductOfWithShiftNeg(const Matrix& a, const Matrix& b, size_t shift, size_t negnumber) - { - if (a.IsEmpty() || b.IsEmpty()) - LogicError("AssignElementProductOfWithShiftNeg: Matrix is empty."); + template + Matrix& Matrix::AssignElementProductOfWithShiftNeg(const Matrix& a, const Matrix& b, size_t shift, size_t negnumber) + { + if (a.IsEmpty() || b.IsEmpty()) + LogicError("AssignElementProductOfWithShiftNeg: Matrix is empty."); - assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); - if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) - InvalidArgument("The input matrix dimensions do not match."); + assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); + if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) + InvalidArgument("The input matrix dimensions do not match."); - if (a.GetNumRows() != 1) - InvalidArgument("AssignElementProductOfWithShiftNeg: The input matrix must be a row vector."); + if (a.GetNumRows() != 1) + InvalidArgument("AssignElementProductOfWithShiftNeg: The input matrix must be a row vector."); - DecideAndMoveToRightDevice(a, b, *this); - if (!(a.GetMatrixType() == b.GetMatrixType())) - NOT_IMPLEMENTED; + DecideAndMoveToRightDevice(a, b, *this); + if (!(a.GetMatrixType() == b.GetMatrixType())) + NOT_IMPLEMENTED; this->SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); - DISPATCH_MATRIX_ON_FLAG(this, - this, - this->m_CPUMatrix->AssignElementProductOfWithShiftNeg(*a.m_CPUMatrix, *b.m_CPUMatrix, shift, negnumber), - this->m_GPUMatrix->AssignElementProductOfWithShiftNeg(*a.m_GPUMatrix, *b.m_GPUMatrix, shift, negnumber), - NOT_IMPLEMENTED, - NOT_IMPLEMENTED - ); - return *this; - } + DISPATCH_MATRIX_ON_FLAG(this, + this, + this->m_CPUMatrix->AssignElementProductOfWithShiftNeg(*a.m_CPUMatrix, *b.m_CPUMatrix, shift, negnumber), + this->m_GPUMatrix->AssignElementProductOfWithShiftNeg(*a.m_GPUMatrix, *b.m_GPUMatrix, shift, negnumber), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED + ); + return *this; + } + template + Matrix& Matrix::AssignInnerProductOfWithShiftNeg(const Matrix& a, const Matrix& b, const bool isColWise, size_t shift, size_t negnumber) + { + InnerProductWithShiftNeg(a, b, *this, isColWise, shift, negnumber); + return *this; + } - template - Matrix& Matrix::AssignInnerProductOfWithShiftNeg(const Matrix& a, const Matrix& b, const bool isColWise, size_t shift, size_t negnumber) - { - InnerProductWithShiftNeg(a, b, *this, isColWise, shift, negnumber); - return *this; - } - template - void Matrix::InnerProductWithShiftNeg(const Matrix& a, const Matrix& b, Matrix& c, const bool isColWise, size_t shift, size_t negnumber) - { - if (a.IsEmpty() || b.IsEmpty()) - LogicError("InnerProduct: one of the input matrix is empty."); + template + void Matrix::InnerProductWithShiftNeg(const Matrix& a, const Matrix& b, Matrix& c, const bool isColWise, size_t shift, size_t negnumber) + { + if (a.IsEmpty() || b.IsEmpty()) + LogicError("InnerProduct: one of the input matrix is empty."); - DecideAndMoveToRightDevice(a, b, c); + DecideAndMoveToRightDevice(a, b, c); - if (a.GetMatrixType() != b.GetMatrixType()) - NOT_IMPLEMENTED; + if (a.GetMatrixType() != b.GetMatrixType()) + NOT_IMPLEMENTED; c.SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); - DISPATCH_MATRIX_ON_FLAG(&c, - &c, - CPUMatrix::InnerProductWithShiftNeg(*a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix, isColWise, shift, negnumber), - GPUMatrix::InnerProductWithShiftNeg(*a.m_GPUMatrix, 
*b.m_GPUMatrix, *c.m_GPUMatrix, shift, negnumber), - NOT_IMPLEMENTED, - NOT_IMPLEMENTED - ); + DISPATCH_MATRIX_ON_FLAG(&c, + &c, + CPUMatrix::InnerProductWithShiftNeg(*a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix, isColWise, shift, negnumber), + GPUMatrix::InnerProductWithShiftNeg(*a.m_GPUMatrix, *b.m_GPUMatrix, *c.m_GPUMatrix, shift, negnumber), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED + ); + } - } + template + Matrix& Matrix::GetARowByIndex(const Matrix& a, size_t index) + { + if (a.IsEmpty()) + LogicError("GetARowByIndex: Matrix is empty."); - template - Matrix& Matrix::GetARowByIndex(const Matrix& a, size_t index) - { - if (a.IsEmpty()) - LogicError("GetARowByIndex: Matrix is empty."); - - - //WARNING: a and this must have same type - if (!(GetMatrixType() == a.GetMatrixType())) - NOT_IMPLEMENTED; + //WARNING: a and this must have same type + if (!(GetMatrixType() == a.GetMatrixType())) + NOT_IMPLEMENTED; SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); - DISPATCH_MATRIX_ON_FLAG(this, - this, - this->m_CPUMatrix->GetARowByIndex(*a.m_CPUMatrix, index), - this->m_GPUMatrix->GetARowByIndex(*a.m_GPUMatrix, index), - NOT_IMPLEMENTED, - NOT_IMPLEMENTED - ); + DISPATCH_MATRIX_ON_FLAG(this, + this, + this->m_CPUMatrix->GetARowByIndex(*a.m_CPUMatrix, index), + this->m_GPUMatrix->GetARowByIndex(*a.m_GPUMatrix, index), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED + ); - return *this; - } + return *this; + } - template - void Matrix::ConductRowElementMultiplyWithShift(const Matrix& a, const Matrix& b, Matrix& c, size_t shift, bool bFirstmatrixfixed) - { - if (a.IsEmpty() || b.IsEmpty()) - LogicError("InnerProduct: one of the input matrix is empty."); + template + void Matrix::ConductRowElementMultiplyWithShift(const Matrix& a, const Matrix& b, Matrix& c, size_t shift, bool bFirstmatrixfixed) + { + if (a.IsEmpty() || b.IsEmpty()) + LogicError("InnerProduct: one of the input matrix is empty."); - DecideAndMoveToRightDevice(a, b, c); + DecideAndMoveToRightDevice(a, b, c); - if (a.GetMatrixType() != b.GetMatrixType()) - NOT_IMPLEMENTED; + if (a.GetMatrixType() != b.GetMatrixType()) + NOT_IMPLEMENTED; c.SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); - DISPATCH_MATRIX_ON_FLAG(&c, - &c, - CPUMatrix::ConductRowElementMultiplyWithShift(*a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix, shift, bFirstmatrixfixed), - GPUMatrix::ConductRowElementMultiplyWithShift(*a.m_GPUMatrix, *b.m_GPUMatrix, *c.m_GPUMatrix, shift, bFirstmatrixfixed), - NOT_IMPLEMENTED, - NOT_IMPLEMENTED - ); + DISPATCH_MATRIX_ON_FLAG(&c, + &c, + CPUMatrix::ConductRowElementMultiplyWithShift(*a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix, shift, bFirstmatrixfixed), + GPUMatrix::ConductRowElementMultiplyWithShift(*a.m_GPUMatrix, *b.m_GPUMatrix, *c.m_GPUMatrix, shift, bFirstmatrixfixed), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED + ); + } - } + template + Matrix& Matrix::AssignElementProductOfWithShift(const Matrix& a, const Matrix& b, size_t shift) + { + if (a.IsEmpty() || b.IsEmpty()) + LogicError("AssignElementProductOfWithShift: Matrix is empty."); - template - Matrix& Matrix::AssignElementProductOfWithShift(const Matrix& a, const Matrix& b, size_t shift) - { - if (a.IsEmpty() || b.IsEmpty()) - LogicError("AssignElementProductOfWithShift: Matrix is empty."); + assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); + if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) + InvalidArgument("The input matrix dimensions do not match."); - assert(a.GetNumRows() == b.GetNumRows() && 
a.GetNumCols() == b.GetNumCols()); - if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) - InvalidArgument("The input matrix dimensions do not match."); + if (a.GetNumRows() != 1) + InvalidArgument("AssignElementProductOfWithShiftNeg: The input matrix must be a row vector."); - if (a.GetNumRows() != 1) - InvalidArgument("AssignElementProductOfWithShiftNeg: The input matrix must be a row vector."); - - DecideAndMoveToRightDevice(a, b, *this); - if (!(a.GetMatrixType() == b.GetMatrixType())) - NOT_IMPLEMENTED; + DecideAndMoveToRightDevice(a, b, *this); + if (!(a.GetMatrixType() == b.GetMatrixType())) + NOT_IMPLEMENTED; this->SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); - DISPATCH_MATRIX_ON_FLAG(this, - this, - this->m_CPUMatrix->AssignElementProductOfWithShift(*a.m_CPUMatrix, *b.m_CPUMatrix, shift), - this->m_GPUMatrix->AssignElementProductOfWithShift(*a.m_GPUMatrix, *b.m_GPUMatrix, shift), - NOT_IMPLEMENTED, - NOT_IMPLEMENTED - ); - return *this; - } - + DISPATCH_MATRIX_ON_FLAG(this, + this, + this->m_CPUMatrix->AssignElementProductOfWithShift(*a.m_CPUMatrix, *b.m_CPUMatrix, shift), + this->m_GPUMatrix->AssignElementProductOfWithShift(*a.m_GPUMatrix, *b.m_GPUMatrix, shift), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED + ); + return *this; + } template void Matrix::RCRFBackwardCompute(const Matrix& alpha, Matrix& beta, - Matrix& functionValues, const Matrix& lbls, - const Matrix& pos_scores, const Matrix& pair_scores, const int shift) + Matrix& functionValues, const Matrix& lbls, + const Matrix& pos_scores, const Matrix& pair_scores, const int shift) { DecideAndMoveToRightDevice(alpha, beta); functionValues._transferToDevice(alpha.GetDeviceId()); @@ -5134,55 +5130,70 @@ namespace Microsoft { namespace MSR { namespace CNTK { ); } - template - Matrix& Matrix::DropFrame(const Matrix& label, const Matrix& gamma, const ElemType & threshhold) - { - DecideAndMoveToRightDevice(*this, label, gamma); + template + Matrix& Matrix::DropFrame(const Matrix& label, const Matrix& gamma, const ElemType & threshhold) + { + DecideAndMoveToRightDevice(*this, label, gamma); - if (label.GetNumCols() != gamma.GetNumCols() || label.GetNumRows() != gamma.GetNumRows()) - LogicError("DropFrame: label matrix is not in the same size as gamm matrix."); - this->SwitchToMatrixType(label.GetMatrixType(), label.GetFormat(), false); + if (label.GetNumCols() != gamma.GetNumCols() || label.GetNumRows() != gamma.GetNumRows()) + LogicError("DropFrame: label matrix is not in the same size as gamm matrix."); + this->SwitchToMatrixType(label.GetMatrixType(), label.GetFormat(), false); - DISPATCH_MATRIX_ON_FLAG(this, - this, - this->m_CPUMatrix->DropFrame(*label.m_CPUMatrix, *gamma.m_CPUMatrix, threshhold), - this->m_GPUMatrix->DropFrame(*label.m_GPUMatrix, *gamma.m_GPUMatrix, threshhold), - NOT_IMPLEMENTED, - NOT_IMPLEMENTED - ); + DISPATCH_MATRIX_ON_FLAG(this, + this, + this->m_CPUMatrix->DropFrame(*label.m_CPUMatrix, *gamma.m_CPUMatrix, threshhold), + this->m_GPUMatrix->DropFrame(*label.m_GPUMatrix, *gamma.m_GPUMatrix, threshhold), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED + ); - return *this; - } + return *this; + } - /// c = alpha * (a-b) - /// if a, b, c must have same dim - /// Scalar - /// Input matrix - /// Input matrix - /// Resulting matrix, user is responsible for allocating this - template - Matrix& Matrix::AssignSequenceError(const ElemType hsmoothingWeight, const Matrix& label, - const Matrix& dnnoutput, const Matrix& gamma, ElemType alpha) - { - DecideAndMoveToRightDevice(label, dnnoutput, 
gamma); + /// c = alpha * (a-b) + /// if a, b, c must have same dim + /// Scalar + /// Input matrix + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + Matrix& Matrix::AssignSequenceError(const ElemType hsmoothingWeight, const Matrix& label, + const Matrix& dnnoutput, const Matrix& gamma, ElemType alpha) + { + DecideAndMoveToRightDevice(label, dnnoutput, gamma); - if (!(label.GetMatrixType() == gamma.GetMatrixType())) - NOT_IMPLEMENTED; + if (!(label.GetMatrixType() == gamma.GetMatrixType())) + NOT_IMPLEMENTED; - this->SwitchToMatrixType(label.GetMatrixType(), label.GetFormat(), false); + this->SwitchToMatrixType(label.GetMatrixType(), label.GetFormat(), false); - DISPATCH_MATRIX_ON_FLAG(this, - this, - this->m_CPUMatrix->AssignSequenceError(hsmoothingWeight, *label.m_CPUMatrix, *dnnoutput.m_CPUMatrix, *gamma.m_CPUMatrix, alpha), - this->m_GPUMatrix->AssignSequenceError(hsmoothingWeight, *label.m_GPUMatrix, *dnnoutput.m_GPUMatrix, *gamma.m_GPUMatrix, alpha), - NOT_IMPLEMENTED, - NOT_IMPLEMENTED - ); - return *this; - } + DISPATCH_MATRIX_ON_FLAG(this, + this, + this->m_CPUMatrix->AssignSequenceError(hsmoothingWeight, *label.m_CPUMatrix, *dnnoutput.m_CPUMatrix, *gamma.m_CPUMatrix, alpha), + this->m_GPUMatrix->AssignSequenceError(hsmoothingWeight, *label.m_GPUMatrix, *dnnoutput.m_GPUMatrix, *gamma.m_GPUMatrix, alpha), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED + ); + return *this; + } #pragma endregion Static BLAS Functions + template + void Matrix::TensorOp(ElemType beta, const Matrix& a, const Matrix& b, ElemType alpha, ElementWiseOperator op, + const array & offsets, + const vector & regularOpDims, const array, 3> & regularStrides, + const vector & reducingOpDims, const array, 3> & reducingStrides) + { + DISPATCH_MATRIX_ON_FLAG(this, + this, + m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, *b.m_CPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides), + NOT_IMPLEMENTED, //m_GPUMatrix->TensorOp(beta, offsets, *a.m_GPUMatrix, *b.m_GPUMatrix, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED + ); + } + template class Matrix; template class Matrix; diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index f2d00cf84..0a6c488c4 100644 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -16,11 +16,11 @@ #include "CommonMatrix.h" #include #include // for shared_ptr +#include // This class is exported from the Math.dll namespace Microsoft { namespace MSR { namespace CNTK { - enum CurrentDataLocation { NONE, CPU, GPU, BOTH @@ -458,6 +458,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { static bool HasElement(const Matrix& a, const ElemType value = 0.0); static void TensorShuffleScaleAndAdd(ElemType keepWeight, const Matrix& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const Matrix& b, Matrix& c); + + void TensorOp(ElemType beta, const Matrix& a, const Matrix& b, ElemType alpha, ElementWiseOperator op, + const std::array & offsets, + const std::vector & regularOpDims, const std::array, 3> & regularStrides, + const std::vector & reducingOpDims, const std::array, 3> & reducingStrides); public: void Read(File& stream); void Write(File& stream) const; diff --git a/Source/Math/TensorView.cpp b/Source/Math/TensorView.cpp index 21fab4559..2a64f3e64 100644 --- a/Source/Math/TensorView.cpp +++ b/Source/Math/TensorView.cpp @@ -98,7 +98,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // flatten consecutive 
dimensions // Dimensions must be consecutive in memory, and either non-broadcasting or all-broadcasting, across all dimensions. // After this, as, bs, and cs no longer match the TensorShape objects. - fprintf(stderr, "Pre-flatten: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); + //fprintf(stderr, "Pre-flatten: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); for (size_t k = 1; k < dims; k++) { for (size_t i = 0; i < N; i++) @@ -116,7 +116,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { opDims = TensorShape(opDims).Flatten(k).GetDims(); // (ugh) nope:; } - fprintf(stderr, "Post-flatten: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); + //fprintf(stderr, "Post-flatten: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); // remove singleton dimensions vector toDrop(dims, false); @@ -132,7 +132,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { shapes[i] = shapes[i].DropSingletonDims(toDrop); opDims = TensorShape(opDims).DropSingletonDims(toDrop).GetDims(); // (ugh) // note: if op is a scalar, then we end up with 0 dimensions here, which is allowed - fprintf(stderr, "Post-drop: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); + //fprintf(stderr, "Post-drop: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); // determine broadcasting; that is, set strides to 0 for 1-dimensions // To be more precise, we should only set actually broadcasting dimensions to 0. @@ -141,6 +141,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t i = 0; i < N; i++) shapes[i] = shapes[i].WithBroadcastStrides(); + fprintf(stderr, "Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); + // determine inverse broadcasting dimensions // Inverse broadcasting dims are actual for loops in the kernel, whereas broadcasting input dims are handled by the thread index. // For regular input dims: @@ -161,10 +163,31 @@ namespace Microsoft { namespace MSR { namespace CNTK { // - map dimensions (regular) for output // - save result + // separate out the inverse-broadcasting dimensions + // Any singleton dimension in the result tensor is inverse-broadcasting, because there must be at least one non-1 dimension + // in one of the inputs, otherwise the entire dimension would have been optimized away above. 
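+            // (Illustration: for a [2 x 3] input summed into a [2 x 1] output, dimension 1 has extent 3 in the
+            //  input but 1 in the result, so it is inverse-broadcasting: the kernel loops over it and sums.)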
+ vector isReducingDim(dims); // true for each inverse-broadcasting dimension + for (size_t k = 0; k < dims; k++) + isReducingDim[k] = shapes.back()[k] == 1; + + // form the regular (non-inverse-broadcasting) dims + array, N> regularStrides; + for (size_t i = 0; i < N; i++) + regularStrides[i] = shapes[i].DropSingletonDims(isReducingDim).GetStrides(); + auto regularOpDims = TensorShape(opDims).DropSingletonDims(isReducingDim).GetDims(); // (ugh) + + // form the inverse-broadcasting dims + vector isRegularDim(dims); // true for each inverse-broadcasting dimension + for (size_t k = 0; k < dims; k++) + isRegularDim[k] = !isReducingDim[k]; // (no way to do this more nicely?) + array, N> reducingStrides; + for (size_t i = 0; i < N; i++) + reducingStrides[i] = shapes[i].DropSingletonDims(isRegularDim).GetStrides(); + auto reducingOpDims = TensorShape(opDims).DropSingletonDims(isReducingDim).GetDims(); // (ugh) + // now perform the operation - fprintf(stderr, "Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); - // :) - beta; alpha; + array offsets = { a.GetShape().GetOffset(), b.GetShape().GetOffset(), c.GetShape().GetOffset() }; + c.GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } // simple test function for testing stuff diff --git a/Source/Math/TensorView.h b/Source/Math/TensorView.h index be037fa5b..7802f908d 100644 --- a/Source/Math/TensorView.h +++ b/Source/Math/TensorView.h @@ -43,7 +43,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // accessors // ------------------------------------------------------------------- - const Matrix & GetSOB() const { return *m_sob; } + Matrix & GetSOB() const { return *m_sob; } const TensorShape & GetShape() const { return m_shape; } // ------------------------------------------------------------------- From 38cb2fa9ecf788285b9358595b8f5b115fffb6bd Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Thu, 17 Dec 2015 16:14:54 -0800 Subject: [PATCH 06/19] bug fix in MBLayout: We should not guard against all parallel sequences having a gap at a time step, as that happens in truncated BPTT, and it would be much more complex to fix the reader, so we allow it --- Source/Common/Include/Sequences.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/Source/Common/Include/Sequences.h b/Source/Common/Include/Sequences.h index 15484458f..2d5543cc4 100644 --- a/Source/Common/Include/Sequences.h +++ b/Source/Common/Include/Sequences.h @@ -109,14 +109,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_numTimeSteps = numTimeSteps; // allocate lookup tables (note: except at the start, these don't really allocate new memory most of the time) #if 1 - if (m_distanceToStart.GetNumRows() != m_numParallelSequences || m_distanceToStart.GetNumCols() != m_numTimeSteps) // sanity check for debugging a regression + if ((m_distanceToStart.GetNumRows() != m_numParallelSequences || m_distanceToStart.GetNumCols() != m_numTimeSteps) && m_numTimeSteps > 0) // sanity check for debugging a regression fprintf(stderr, "MBLayout::Init: Resizing m_distanceToStart from %d x %d to %d x %d\n", (int)m_distanceToStart.GetNumRows(), (int)m_distanceToStart.GetNumCols(), (int)m_numParallelSequences, (int)m_numTimeSteps); // (I really want to know about actual allocations, but this is a necessary condition for them) #endif 
         m_distanceToStart.Resize(m_numParallelSequences, m_numTimeSteps);
         m_distanceToEnd.Resize(m_numParallelSequences, m_numTimeSteps);
-        m_distanceToNearestStart.assign(m_numTimeSteps, SIZE_MAX);
-        m_distanceToNearestEnd.assign(m_numTimeSteps, SIZE_MAX);
+        m_distanceToNearestStart.assign(m_numTimeSteps, PTRDIFF_MAX);
+        m_distanceToNearestEnd.assign(m_numTimeSteps, PTRDIFF_MAX);
         m_timeStepHasGap.assign(m_numTimeSteps, false);
         m_columnsValidityMask.Resize(0, 0); // invalidate
         // reset state
@@ -190,8 +190,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             LogicError("AddSequence: Sequence added to an MBLayout must overlap with minibatch.");
 
         // remember it
-#if 1
-        auto cap = m_sequences.capacity();  // some sanity check for debugging a speed regression
+#ifdef _DEBUG
+        auto cap = m_sequences.capacity();  // Some sanity check for debugging a speed regression. This should only show up during the first minibatches, and growing only.
         m_sequences.push_back(seqDesc);
         if (cap != m_sequences.capacity())
             fprintf(stderr, "AddSequence: m_sequences was reallocated from capacity %d to %d\n", (int)cap, (int)m_sequences.capacity());
@@ -218,8 +218,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             {
                 // update the nearest sentence boundaries, minimum over all parallel sequences
                 // If 0, then we are on a boundary. If not 0, we can still test in presence of FrameRange.m_timeOffset.
-                size_t distanceToStart = (size_t)((ptrdiff_t)t - beginTime);
-                size_t distanceToEnd = endTime - 1 - t;
+                ptrdiff_t distanceToStart = (ptrdiff_t)t - beginTime;
+                ptrdiff_t distanceToEnd = (ptrdiff_t)(endTime - 1 - t);
                 m_distanceToStart(s, t) = (float)distanceToStart;
                 m_distanceToEnd(s, t) = (float)distanceToEnd;
                 // and the aggregate
@@ -355,7 +355,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         // m_distanceToNearestStart  = [ 0 1 2 3 4 ]
         // m_distanceToNearestEnd    = [ 2 1 0 1 0 ]
         Matrix<float> m_distanceToStart, m_distanceToEnd;                   // (s,t); value<0 stands for gap
-        vector<size_t> m_distanceToNearestStart, m_distanceToNearestEnd;    // [t]    (does not store info about gaps; consult m_timeStepHasGap[] vector instead)
+        vector<ptrdiff_t> m_distanceToNearestStart, m_distanceToNearestEnd; // [t]    (does not store info about gaps; consult m_timeStepHasGap[] vector instead)
 
         vector<bool> m_timeStepHasGap;                                      // [t] true if at least one gap in time step t
 
@@ -551,7 +551,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if (s == SIZE_MAX)  // aggregate requested
             {
                 // determine flags from aggregate vectors
-                assert(m_distanceToNearestStart[t] != SIZE_MAX); // (sanity check)
+                // Note: We allow that all parallel sequences contain gaps (m_distanceToNearestStart[t] == PTRDIFF_MAX)
+                // because that makes implementation of the reader easier for truncated BPTT (it knows too late that there are not that many frames left).
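+                // (If every parallel sequence has a gap at t, m_distanceToNearestStart[t] simply stays PTRDIFF_MAX,
+                //  and the boundary tests below then see "very far from any boundary", which is the intended behavior.)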
auto distanceToStart = (ptrdiff_t)m_distanceToNearestStart[t]; if (distanceToStart < -fr.m_timeOffset) return true; From 928da8828c2bf01f6c6b33f905ca7ff7d6ce8434 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Thu, 17 Dec 2015 17:04:55 -0800 Subject: [PATCH 07/19] first version of CPU implementation of TensorView::DoSumOf() working now --- Source/Common/Include/DataTensor.h | 11 +++--- Source/Math/CPUMatrix.cpp | 2 +- Source/Math/Matrix.h | 2 ++ Source/Math/TensorView.cpp | 57 +++++++++++++++++++++--------- 4 files changed, 48 insertions(+), 24 deletions(-) diff --git a/Source/Common/Include/DataTensor.h b/Source/Common/Include/DataTensor.h index 0152343d0..e661efe79 100644 --- a/Source/Common/Include/DataTensor.h +++ b/Source/Common/Include/DataTensor.h @@ -227,19 +227,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { result.m_strides[k] = /*result.m_dims[k - 1] *, it's 1 */ result.m_strides[k - 1]; return result; } - TensorShape DropSingletonDims(const std::vector & toDrop) const // flatten [k] with [k-1] if toFlatten[k] is set + TensorShape DropDims(const std::vector & toDrop) const // remove dimension { + // this deletes a dimension while retaining strides + // This implies a slice to [0] for this dimension. TensorShape result = *this; size_t j = 0; for (size_t k = 0; k < size(); k++) { if (toDrop[k]) - { - if (result.m_dims[k] != 1) - LogicError("DeropSingletonDims() cannot drop non-singleton dimensions."); - else - continue; - } + continue; else { // example diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index 092cd8c9e..788b6287e 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -5621,7 +5621,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { const std::vector & regularOpDims, const std::array, N> & regularStrides, const std::vector & reducingOpDims, const std::array, N> & reducingStrides) { - size_t dims = regularOpDims.size(); + size_t dims = reducingOpDims.size(); switch (dims) { case 2: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index 0a6c488c4..1f7f1330f 100644 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -17,6 +17,7 @@ #include #include // for shared_ptr #include +#include // This class is exported from the Math.dll namespace Microsoft { namespace MSR { namespace CNTK { @@ -200,6 +201,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void SetValue(const Matrix& deepCopyFrom, const MatrixFormat format=matrixFormatSparseCSR); void SetValue(const size_t numRows, const size_t numCols, int deviceId, ElemType *pArray, const size_t matrixFlags = matrixFlagNormal); void SetValue(const size_t rIdx, const size_t cIdx, ElemType val); // set matrix sparsely + void SetValue(const size_t numRows, const size_t numCols, std::initializer_list l) { std::vector vals(l); assert(vals.size() == numRows * numCols); SetValue(numRows, numCols, GetDeviceId(), vals.data(), matrixFormatRowMajor); } // SetValue(2,3, {1,2,3, 4,5,6}); static ElemType MakeNan(size_t payload); void Invalidate() { SetValue(MakeNan(__LINE__)); } void SetMatrixFromCSCFormat(const CPUSPARSE_INDEX_TYPE *h_CSCCol, const CPUSPARSE_INDEX_TYPE *h_Row, const ElemType *h_Val, diff --git a/Source/Math/TensorView.cpp b/Source/Math/TensorView.cpp index 2a64f3e64..f91e11899 100644 --- a/Source/Math/TensorView.cpp +++ b/Source/Math/TensorView.cpp @@ -129,8 +129,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { 
neither:; } for (size_t i = 0; i < N; i++) - shapes[i] = shapes[i].DropSingletonDims(toDrop); - opDims = TensorShape(opDims).DropSingletonDims(toDrop).GetDims(); // (ugh) + shapes[i] = shapes[i].DropDims(toDrop); + opDims = TensorShape(opDims).DropDims(toDrop).GetDims(); // (ugh) // note: if op is a scalar, then we end up with 0 dimensions here, which is allowed //fprintf(stderr, "Post-drop: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); @@ -173,8 +173,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // form the regular (non-inverse-broadcasting) dims array, N> regularStrides; for (size_t i = 0; i < N; i++) - regularStrides[i] = shapes[i].DropSingletonDims(isReducingDim).GetStrides(); - auto regularOpDims = TensorShape(opDims).DropSingletonDims(isReducingDim).GetDims(); // (ugh) + regularStrides[i] = shapes[i].DropDims(isReducingDim).GetStrides(); + auto regularOpDims = TensorShape(opDims).DropDims(isReducingDim).GetDims(); // (ugh) // form the inverse-broadcasting dims vector isRegularDim(dims); // true for each inverse-broadcasting dimension @@ -182,8 +182,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { isRegularDim[k] = !isReducingDim[k]; // (no way to do this more nicely?) array, N> reducingStrides; for (size_t i = 0; i < N; i++) - reducingStrides[i] = shapes[i].DropSingletonDims(isRegularDim).GetStrides(); - auto reducingOpDims = TensorShape(opDims).DropSingletonDims(isReducingDim).GetDims(); // (ugh) + reducingStrides[i] = shapes[i].DropDims(isRegularDim).GetStrides(); + auto reducingOpDims = TensorShape(opDims).DropDims(isRegularDim).GetDims(); // (ugh) // now perform the operation array offsets = { a.GetShape().GetOffset(), b.GetShape().GetOffset(), c.GetShape().GetOffset() }; @@ -195,16 +195,41 @@ namespace Microsoft { namespace MSR { namespace CNTK { template /*static*/ void TensorView::Test() { - Matrix m1(-1); m1.Resize(1, 42); - Matrix m2(-1); m2.Resize(13, 1); - Matrix m3(-1); m3.Resize(13, 21); - TensorShape s1(1, 2, 21); - TensorShape s2(13, 1); - TensorShape s3(13, 1, 21); - let t1 = TensorView(m1, s1); t1; - let t2 = TensorView(m2, s2); t2; - auto t3 = TensorView(m3, s3); t3; - t3.DoSumOf(0, t1, t2, 1); + Matrix m1(-1); + Matrix m2(-1); + Matrix m3(-1); + { + m1.SetValue(2, 3, { 1, 2, 3, + 4, 5, 6 }); + m2.SetValue(2, 1, { 13, + 42 }); + m3.Resize(2, 3); + TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1); + m3.Print(); + } + { + m3.Resize(2, 1); + TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1); + m3.Print(); + } + { + m3.Resize(1, 3); + TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1); + m3.Print(); + } + { + m1.Resize(1, 42); + m2.Resize(13, 1); + m3.Resize(13, 21); + TensorShape s1(1, 2, 21); + TensorShape s2(13, 1); + TensorShape s3(13, 1, 21); + let t1 = TensorView(m1, s1); t1; + let t2 = TensorView(m2, s2); t2; + auto t3 = TensorView(m3, s3); t3; + t3.DoSumOf(0, t1, t2, 1); + m3.Print(); + } } template class TensorView; From 83e5bbc3f538b7c99998a743273497da319ddd14 Mon Sep 17 00:00:00 2001 From: Qiwei Ye Date: Fri, 18 Dec 2015 12:38:39 +0800 Subject: [PATCH 08/19] Revert "Revert "adding an MPI init test in case of that MPI was initialized repeatedly"" This reverts commit 23ebe452a5e35dddfba2d08e8fb3265901bfc8af. 
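For reference, the guard being reinstated follows the standard MPI "initialize at
most once" pattern sketched below (the EnsureMpiInitialized() helper and the argument
handling are illustrative only; the actual code calls CNTK's MPI_Init_DL() wrapper
rather than MPI_Init() directly):

    #include <mpi.h>

    void EnsureMpiInitialized(int* argc, char*** argv)
    {
        int alreadyInitialized = 0;
        MPI_Initialized(&alreadyInitialized); // legal to call even before MPI_Init()
        if (!alreadyInitialized)
            MPI_Init(argc, argv);             // initializing twice is an MPI error
    }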
--- Source/Common/Include/MPIWrapper.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Source/Common/Include/MPIWrapper.h b/Source/Common/Include/MPIWrapper.h index 781fab023..1ffb16c92 100644 --- a/Source/Common/Include/MPIWrapper.h +++ b/Source/Common/Include/MPIWrapper.h @@ -112,7 +112,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { fprintf(stderr, "MPIWrapper: initializing MPI\n"); fflush(stderr); - MPI_Init_DL() || MpiFail("mpiaggregator: MPI_Init"); + int flag = 0; + MPI_Initialized(&flag); + if (!flag) + { + MPI_Init_DL() || MpiFail("mpiaggregator: MPI_Init"); + } MPI_Comm_rank(MPI_COMM_WORLD, &m_myRank); MPI_Comm_size(MPI_COMM_WORLD, &m_numMPINodes); m_numNodesInUse = m_numMPINodes; From 7d32cdfd1abc9d6186a49c8591f6353140094a43 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Thu, 17 Dec 2015 22:41:19 -0800 Subject: [PATCH 09/19] implemented all binary tensor operators (don't we love macros!) --- Source/Math/CPUMatrix.cpp | 51 +++++++++++++++++++++++--------------- Source/Math/CommonMatrix.h | 11 ++++++++ Source/Math/TensorView.cpp | 25 +++++++++++++------ Source/Math/TensorView.h | 7 +++++- 4 files changed, 66 insertions(+), 28 deletions(-) diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index 788b6287e..c7db6e43a 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -5256,27 +5256,27 @@ namespace Microsoft { namespace MSR { namespace CNTK { return *this; } - #pragma endregion Static BLAS Functions - double logadd(double x, double y) + template + ElemType logadd_(ElemType x, ElemType y) { - double temp, diff, z; - - if (x < y) { - temp = x; x = y; y = temp; - } - diff = y - x; - if (diff < MINLOGEXP) + if (x < y) { - return (x < LSMALL)?LZERO:x; + ElemType temp = x; x = y; y = temp; + } + ElemType diff = y - x; + if (diff < (ElemType)MINLOGEXP) + { + return (x < (ElemType)LSMALL) ? 
(ElemType)LZERO : x;
         }
         else
         {
-            z = exp(diff);
-            return x + log(1.0 + z);
+            ElemType z = exp_(diff);
+            return x + log_((ElemType)1.0 + z);
         }
     }
+    double logadd(double x, double y) { return logadd_(x, y); }
 
     template<class ElemType>
     ElemType CPUMatrix<ElemType>::LogAddSumOfElements() const
@@ -5546,8 +5546,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN & opfn,
             const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, N> & reducingStrides)
         {
-            array<ptrdiff_t, N> strides;
-            for (size_t i = 0; i < N; i++)      // N = a small constant, this will be unrolled
+            array<ptrdiff_t, N - 1> strides;    // N-1 because last one is the result pointer, which is unused in reduction
+            for (size_t i = 0; i < N - 1; i++)  // N = a small constant, this will be unrolled
                 strides[i] = reducingStrides[i][(size_t)m];
             ElemType aggregate = 0;
             for (size_t dim = reducingOpDims[(size_t)m]; dim-- > 0;)
@@ -5555,8 +5555,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 // need to descend into one loop deeper
                 aggregate += TensorOpReduction<ElemType, OPFN, N, m - 1>::Loop(pointers, opfn, reducingOpDims, reducingStrides);
                 // advance the pointers
-                for (size_t i = 0; i < N; i++)
-                    pointers[i] += strides[i];
+                for (size_t i = 0; i < N - 1; i++)
+                    pointers[i] += strides[i];  // note: last pointer (result) is unused and untouched here
             }
             return aggregate;
         }
@@ -5653,7 +5653,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         }
     }
 
+    // define a static function for every operation
+#define DefOp(op, expr) template<class ElemType> static inline ElemType Op ## op(ElemType a, ElemType b) { return expr; }
+
+    DefOp(Sum, a + b); DefOp(Difference, a - b); DefOp(ElementWiseProduct, a*b); DefOp(ElementWiseQuotient, a / b);
+    DefOp(LogSum, logadd_(a, b)); DefOp(Max, a > b ? a : b); DefOp(Min, a < b ? a : b);
+    DefOp(EQ, a == b); DefOp(NE, a != b); DefOp(GT, a > b); DefOp(LT, a < b); DefOp(GE, a >= b); DefOp(LE, a <= b);
+
     // perform binary operation 'op' on a and b giving c, reinterpreting the matrices as tensors as specified by the dims and strides
+    // This maps 'op' to a lambda.
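+    // (Illustration: the CaseBinaryTensorOp(Sum) case below expands to
+    //      case ElementWiseOperator::opSum:
+    //          return TensorOpWithFn(beta, pointers, alpha,
+    //                                [](const array<ElemType*, 3> & pp) { return OpSum(*(pp[0]), *(pp[1])); },
+    //                                offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
+    //  i.e. each op code gets its own inlinable lambda around the scalar OpXxx function.)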
template void CPUMatrix::TensorOp(ElemType beta, const CPUMatrix& a, const CPUMatrix& b, ElemType alpha, ElementWiseOperator op, const std::array & offsets, @@ -5661,12 +5669,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { const std::vector & reducingOpDims, const std::array, 3> & reducingStrides) { array pointers = { a.m_pArray, b.m_pArray, m_pArray }; +#define CaseBinaryTensorOp(oper) \ + case ElementWiseOperator::op ## oper: \ + return TensorOpWithFn(beta, pointers, alpha, [](const array & pp) { return Op ## oper(*(pp[0]), *(pp[1])); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides) switch (op) { - case ElementWiseOperator::opSum: - return TensorOpWithFn(beta, pointers, alpha, [](const array & pp) { return *(pp[0]) + *(pp[1]); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - default: - LogicError("TensorNnaryOp: Unknown op code %d.", (int)op); + CaseBinaryTensorOp(Sum); CaseBinaryTensorOp(Difference); CaseBinaryTensorOp(ElementWiseProduct); CaseBinaryTensorOp(ElementWiseQuotient); + CaseBinaryTensorOp(LogSum); CaseBinaryTensorOp(Max); CaseBinaryTensorOp(Min); + CaseBinaryTensorOp(EQ); CaseBinaryTensorOp(NE); CaseBinaryTensorOp(GT); CaseBinaryTensorOp(LT); CaseBinaryTensorOp(GE); CaseBinaryTensorOp(LE); + default: LogicError("TensorNnaryOp: Unknown op code %d.", (int)op); } } diff --git a/Source/Math/CommonMatrix.h b/Source/Math/CommonMatrix.h index afd5d7d62..1a9762233 100644 --- a/Source/Math/CommonMatrix.h +++ b/Source/Math/CommonMatrix.h @@ -53,12 +53,23 @@ namespace Microsoft { namespace MSR { namespace CNTK { opLogSum, opMax, opMin, opEQ, opNE, opGT, opLT, opGE, opLE, // unary (or binary with constant parameter) + opCopy, opNegate, opNot, opSaturate, opAbs, + opSumAlpha, opSubDifferenceToAlpha, opSubDifferenceFromAlpha, opSigmoid, opSigmoidDerivative, opTanh, opSqrt, opExp, opLog, opLinearRectifierDerivative, opCosine, opNegativeSine // Note: not all of the above are actually implement at present; and not all that's implemented has an opcode. }; + // declare float and double versions of a func under f_ + // e.g. 
exp_ -> exp(double), expf(float) +#define OverloadUnaryMathFns(func) \ + static inline float func ## _(float arg) { return func ## f(arg); } \ + static inline double func ## _(double arg) { return func(arg); } + + OverloadUnaryMathFns(exp); OverloadUnaryMathFns(log); + OverloadUnaryMathFns(tanh); OverloadUnaryMathFns(cos); OverloadUnaryMathFns(sin); + // ----------------------------------------------------------------------- // various enums to describe // ----------------------------------------------------------------------- diff --git a/Source/Math/TensorView.cpp b/Source/Math/TensorView.cpp index f91e11899..e2c3cac27 100644 --- a/Source/Math/TensorView.cpp +++ b/Source/Math/TensorView.cpp @@ -200,22 +200,33 @@ namespace Microsoft { namespace MSR { namespace CNTK { Matrix m3(-1); { m1.SetValue(2, 3, { 1, 2, 3, - 4, 5, 6 }); - m2.SetValue(2, 1, { 13, - 42 }); + 14, 15, 6 }); + m2.SetValue(2, 1, { 42, + 13 }); + + // broadcasting of an input m3.Resize(2, 3); TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1); m3.Print(); - } - { + + TensorView(m3).DoMaxOf(0, TensorView(m1), TensorView(m2), 1); + m3.Print(); + + TensorView(m3).DoGTOf(0, TensorView(m1), TensorView(m2), 1); + m3.Print(); + + // reduction over columns m3.Resize(2, 1); TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1); m3.Print(); - } - { + + // reduction over rows m3.Resize(1, 3); TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1); m3.Print(); + + TensorView(m3).DoLogSumOf(0, TensorView(m1), TensorView(m2), 1); + m3.Print(); } { m1.Resize(1, 42); diff --git a/Source/Math/TensorView.h b/Source/Math/TensorView.h index 7802f908d..d3b3eef02 100644 --- a/Source/Math/TensorView.h +++ b/Source/Math/TensorView.h @@ -56,7 +56,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { // If beta == 0, c is not read out, i.e. it can be uninitialized or contain NaNs. // ------------------------------------------------------------------- - void DoSumOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha) { DoBinaryOpOf(beta, a, b, alpha, ElementWiseOperator::opSum); } +#define DeclareBinaryTensorOp(oper) \ + void Do ## oper ## Of(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha) { DoBinaryOpOf(beta, a, b, alpha, ElementWiseOperator::op ## oper); } + + DeclareBinaryTensorOp(Sum); DeclareBinaryTensorOp(Difference); DeclareBinaryTensorOp(ElementWiseProduct); DeclareBinaryTensorOp(ElementWiseQuotient); + DeclareBinaryTensorOp(LogSum); DeclareBinaryTensorOp(Max); DeclareBinaryTensorOp(Min); + DeclareBinaryTensorOp(EQ); DeclareBinaryTensorOp(NE); DeclareBinaryTensorOp(GT); DeclareBinaryTensorOp(LT); DeclareBinaryTensorOp(GE); DeclareBinaryTensorOp(LE); static void Test(); From f54e1feaaa811159f0aa778e2e8d45810a4cd649 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 18 Dec 2015 00:07:59 -0800 Subject: [PATCH 10/19] implemented unary and ternary tensor ops. 
CPU implementation of elementwise tensor ops is feature complete (but may require optimization) --- Source/Math/CPUMatrix.cpp | 102 ++++++++++++++++++++++++++++++++++--- Source/Math/CPUMatrix.h | 8 +++ Source/Math/CommonMatrix.h | 16 +++--- Source/Math/Matrix.cpp | 30 +++++++++++ Source/Math/Matrix.h | 10 +++- Source/Math/TensorView.cpp | 71 +++++++++++++++++++------- Source/Math/TensorView.h | 16 ++++++ 7 files changed, 220 insertions(+), 33 deletions(-) diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index c7db6e43a..2718ae8e8 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -5582,6 +5582,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { const std::vector & regularOpDims, const std::array, N> & regularStrides, const std::vector & reducingOpDims, const std::array, N> & reducingStrides) { + // TODO: if leading dim is all-ones, we can hard-code the loop and hope the compiler vectorizes for us // non-scalar case: still nested result loops left array strides; for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled @@ -5635,7 +5636,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // This function now expands into different k. template static inline void TensorOpWithFn(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, - const std::array & offsets, + const std::array & offsets, const std::vector & regularOpDims, const std::array, N> & regularStrides, const std::vector & reducingOpDims, const std::array, N> & reducingStrides) { @@ -5653,14 +5654,75 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } + template + static inline ElemType Sigmoid(ElemType z) + { + if (z >= 0) + return 1 / (1 + exp_(-z)); + else + { + ElemType v = exp_(z); + return v / (1 + v); + } + } + template + static inline ElemType SigmoidDerivative(ElemType z) + { + ElemType v = Sigmoid(z); + return v * (1 - v); + } + template + static inline ElemType LinearRectifierDerivative(ElemType z) + { + return z > 0 ? (ElemType)1 : 0; + } + template + static inline ElemType Sqrt(ElemType z) + { + return sqrt_(max(0, z)); + } + // define a static function for every operation -#define DefOp(op, expr) template static inline ElemType Op ## op(ElemType a, ElemType b) { return expr; } +#define DefUnaryOp(op, expr) template static inline ElemType Op ## op(ElemType a) { return expr; } - DefOp(Sum, a + b); DefOp(Difference, a - b); DefOp(ElementWiseProduct, a*b); DefOp(ElementWiseQuotient, a / b); - DefOp(LogSum, logadd_(a, b)); DefOp(Max, a > b ? a : b); DefOp(Min, a < b ? a : b); - DefOp(EQ, a == b); DefOp(NE, a != b); DefOp(GT, a > b); DefOp(LT, a < b); DefOp(GE, a >= b); DefOp(LE, a <= b); + DefUnaryOp(Copy, a); + DefUnaryOp(Negate, -a); DefUnaryOp(Not, !a); + DefUnaryOp(Abs, fabs_(a)); + DefUnaryOp(Sigmoid, Sigmoid(a)); DefUnaryOp(SigmoidDerivative, SigmoidDerivative(a)); DefUnaryOp(Tanh, tanh_(a)); DefUnaryOp(Sqrt, Sqrt(a)); DefUnaryOp(Exp, exp_(a)); DefUnaryOp(Log, log_(a)); DefUnaryOp(LinearRectifierDerivative, LinearRectifierDerivative(a)); DefUnaryOp(Cosine, cos_(a)); DefUnaryOp(NegativeSine, -sin_(a)); + //DefUnaryOp(SaturateBetaAlpha); DefUnaryOp(SumAlpha); DefUnaryOp(SubDifferenceToAlpha); DefUnaryOp(SubDifferenceFromAlpha); - // perform binary operation 'op' on a and b giving c, reinterpreting the matrices as tensors as specified by the dims and strides + // perform unary operation 'op' on a giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides + // This maps 'op' to a lambda. 
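+    // (Illustration: a call like TensorView(m3).DoSqrtOf(0, TensorView(m1), 1), as in the updated Test(),
+    //  arrives here with op == opSqrt and computes m3[i] = sqrt(m1[i]) elementwise, reusing the same
+    //  broadcasting/reduction machinery as the binary case.)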
+ template + void CPUMatrix::TensorOp(ElemType beta, const CPUMatrix& a, ElemType alpha, ElementWiseOperator op, + const std::array & offsets, + const std::vector & regularOpDims, const std::array, 2> & regularStrides, + const std::vector & reducingOpDims, const std::array, 2> & reducingStrides) + { + array pointers = { a.m_pArray, m_pArray }; +#define CaseUnaryTensorOp(oper) \ + case ElementWiseOperator::op ## oper: \ + return TensorOpWithFn(beta, pointers, alpha, [](const array & pp) { return Op ## oper((*(pp[0]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides) + switch (op) + { + CaseUnaryTensorOp(Copy); + CaseUnaryTensorOp(Negate); CaseUnaryTensorOp(Not); + CaseUnaryTensorOp(Abs); + CaseUnaryTensorOp(Sigmoid); CaseUnaryTensorOp(SigmoidDerivative); CaseUnaryTensorOp(Tanh); CaseUnaryTensorOp(Sqrt); CaseUnaryTensorOp(Exp); CaseUnaryTensorOp(Log); CaseUnaryTensorOp(LinearRectifierDerivative); CaseUnaryTensorOp(Cosine); CaseUnaryTensorOp(NegativeSine); + // functions with lambda arguments--these are different + //CaseUnaryTensorOp(SaturateBetaAlpha); CaseUnaryTensorOp(SumAlpha); CaseUnaryTensorOp(SubDifferenceToAlpha); CaseUnaryTensorOp(SubDifferenceFromAlpha); + default: LogicError("TensorUnaryOp: Unknown op code %d.", (int)op); + } + } + + // define a static function for every operation +#define DefBinaryOp(op, expr) template static inline ElemType Op ## op(ElemType a, ElemType b) { return expr; } + + DefBinaryOp(Sum, a + b); DefBinaryOp(Difference, a - b); DefBinaryOp(ElementWiseProduct, a*b); DefBinaryOp(ElementWiseQuotient, a / b); + DefBinaryOp(LogSum, logadd_(a, b)); DefBinaryOp(Max, a > b ? a : b); DefBinaryOp(Min, a < b ? a : b); + DefBinaryOp(EQ, a == b); DefBinaryOp(NE, a != b); DefBinaryOp(GT, a > b); DefBinaryOp(LT, a < b); DefBinaryOp(GE, a >= b); DefBinaryOp(LE, a <= b); + + // perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides // This maps 'op' to a lambda. template void CPUMatrix::TensorOp(ElemType beta, const CPUMatrix& a, const CPUMatrix& b, ElemType alpha, ElementWiseOperator op, @@ -5671,13 +5733,37 @@ namespace Microsoft { namespace MSR { namespace CNTK { array pointers = { a.m_pArray, b.m_pArray, m_pArray }; #define CaseBinaryTensorOp(oper) \ case ElementWiseOperator::op ## oper: \ - return TensorOpWithFn(beta, pointers, alpha, [](const array & pp) { return Op ## oper(*(pp[0]), *(pp[1])); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides) + return TensorOpWithFn(beta, pointers, alpha, [](const array & pp) { return Op ## oper((*(pp[0])), (*(pp[1]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides) switch (op) { CaseBinaryTensorOp(Sum); CaseBinaryTensorOp(Difference); CaseBinaryTensorOp(ElementWiseProduct); CaseBinaryTensorOp(ElementWiseQuotient); CaseBinaryTensorOp(LogSum); CaseBinaryTensorOp(Max); CaseBinaryTensorOp(Min); CaseBinaryTensorOp(EQ); CaseBinaryTensorOp(NE); CaseBinaryTensorOp(GT); CaseBinaryTensorOp(LT); CaseBinaryTensorOp(GE); CaseBinaryTensorOp(LE); - default: LogicError("TensorNnaryOp: Unknown op code %d.", (int)op); + default: LogicError("TensorBinaryOp: Unknown op code %d.", (int)op); + } + } + + // define a static function for every operation +#define DefTernaryOp(op, expr) template static inline ElemType Op ## op(ElemType a, ElemType b, ElemType c) { return expr; } + + DefTernaryOp(Cond, a ? 
b : c); + + // perform ternary operation 'op' on a, and c giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides + // This maps 'op' to a lambda. + template + void CPUMatrix::TensorOp(ElemType beta, const CPUMatrix& a, const CPUMatrix& b, const CPUMatrix& c, ElemType alpha, ElementWiseOperator op, + const std::array & offsets, + const std::vector & regularOpDims, const std::array, 4> & regularStrides, + const std::vector & reducingOpDims, const std::array, 4> & reducingStrides) + { + array pointers = { a.m_pArray, b.m_pArray, c.m_pArray, m_pArray }; +#define CaseTernaryTensorOp(oper) \ + case ElementWiseOperator::op ## oper: \ + return TensorOpWithFn(beta, pointers, alpha, [](const array & pp) { return Op ## oper((*(pp[0])), (*(pp[1])), (*(pp[2]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides) + switch (op) + { + CaseTernaryTensorOp(Cond); + default: LogicError("TensorTernaryOp: Unknown op code %d.", (int)op); } } diff --git a/Source/Math/CPUMatrix.h b/Source/Math/CPUMatrix.h index 6128204c4..18b4dd5fa 100644 --- a/Source/Math/CPUMatrix.h +++ b/Source/Math/CPUMatrix.h @@ -335,10 +335,18 @@ namespace Microsoft { namespace MSR { namespace CNTK { static void TensorShuffleScaleAndAdd(ElemType keepWeight, const CPUMatrix& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const CPUMatrix& b, CPUMatrix& c); + void TensorOp(ElemType beta, const CPUMatrix& a, ElemType alpha, ElementWiseOperator op, + const std::array & offsets, + const std::vector & regularOpDims, const std::array, 2> & regularStrides, + const std::vector & reducingOpDims, const std::array, 2> & reducingStrides); void TensorOp(ElemType beta, const CPUMatrix& a, const CPUMatrix& b, ElemType alpha, ElementWiseOperator op, const std::array & offsets, const std::vector & regularOpDims, const std::array, 3> & regularStrides, const std::vector & reducingOpDims, const std::array, 3> & reducingStrides); + void TensorOp(ElemType beta, const CPUMatrix& a, const CPUMatrix& b, const CPUMatrix& c, ElemType alpha, ElementWiseOperator op, + const std::array & offsets, + const std::vector & regularOpDims, const std::array, 4> & regularStrides, + const std::vector & reducingOpDims, const std::array, 4> & reducingStrides); static CPUMatrix Ones(const size_t rows, const size_t cols); static CPUMatrix Zeros(const size_t rows, const size_t cols); diff --git a/Source/Math/CommonMatrix.h b/Source/Math/CommonMatrix.h index 1a9762233..8bae0cfd5 100644 --- a/Source/Math/CommonMatrix.h +++ b/Source/Math/CommonMatrix.h @@ -48,16 +48,19 @@ namespace Microsoft { namespace MSR { namespace CNTK { enum ElementWiseOperator { + // unary (or binary with constant parameter) + opCopy, + opNegate, opNot, + opAbs, + opSigmoid, opSigmoidDerivative, opTanh, opSqrt, opExp, opLog, opLinearRectifierDerivative, opCosine, opNegativeSine, + // these are not implemented yet: + opSaturateBetaAlpha, opSumAlpha, opSubDifferenceToAlpha, opSubDifferenceFromAlpha, // binary opSum, opDifference, opElementWiseProduct, opElementWiseQuotient, opLogSum, opMax, opMin, opEQ, opNE, opGT, opLT, opGE, opLE, - // unary (or binary with constant parameter) - opCopy, - opNegate, opNot, - opSaturate, opAbs, - opSumAlpha, opSubDifferenceToAlpha, opSubDifferenceFromAlpha, - opSigmoid, opSigmoidDerivative, opTanh, opSqrt, opExp, opLog, opLinearRectifierDerivative, opCosine, opNegativeSine + // ternary + opCond // Note: not all of the above are actually implement at present; and not all that's implemented has an 
opcode. }; @@ -67,6 +70,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { static inline float func ## _(float arg) { return func ## f(arg); } \ static inline double func ## _(double arg) { return func(arg); } + OverloadUnaryMathFns(fabs); OverloadUnaryMathFns(sqrt); OverloadUnaryMathFns(exp); OverloadUnaryMathFns(log); OverloadUnaryMathFns(tanh); OverloadUnaryMathFns(cos); OverloadUnaryMathFns(sin); diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index d49caee4e..f265413cd 100644 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -5179,6 +5179,21 @@ namespace Microsoft { namespace MSR { namespace CNTK { } #pragma endregion Static BLAS Functions + template + void Matrix::TensorOp(ElemType beta, const Matrix& a, ElemType alpha, ElementWiseOperator op, + const array & offsets, + const vector & regularOpDims, const array, 2> & regularStrides, + const vector & reducingOpDims, const array, 2> & reducingStrides) + { + DISPATCH_MATRIX_ON_FLAG(this, + this, + m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides), + NOT_IMPLEMENTED, //m_GPUMatrix->TensorOp(beta, offsets, *a.m_GPUMatrix, *b.m_GPUMatrix, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED + ); + } + template void Matrix::TensorOp(ElemType beta, const Matrix& a, const Matrix& b, ElemType alpha, ElementWiseOperator op, const array & offsets, @@ -5194,6 +5209,21 @@ namespace Microsoft { namespace MSR { namespace CNTK { ); } + template + void Matrix::TensorOp(ElemType beta, const Matrix& a, const Matrix& b, const Matrix& c, ElemType alpha, ElementWiseOperator op, + const array & offsets, + const vector & regularOpDims, const array, 4> & regularStrides, + const vector & reducingOpDims, const array, 4> & reducingStrides) + { + DISPATCH_MATRIX_ON_FLAG(this, + this, + m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides), + NOT_IMPLEMENTED, //m_GPUMatrix->TensorOp(beta, offsets, *a.m_GPUMatrix, *b.m_GPUMatrix, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED + ); + } + template class Matrix; template class Matrix; diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index 1f7f1330f..b1a2aa9fa 100644 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -378,7 +378,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void VectorMax(Matrix& maxIndexes, Matrix& maxValues, const bool isColWise, int topK) const; void VectorMin(Matrix& minIndexes, Matrix& minValues, const bool isColWise) const; - Matrix& AssignNumOfDiff(const Matrix& a, const Matrix& b, bool searchInCol = false); + Matrix& AssignNumOfDiff(const Matrix& a, const Matrix& b, bool searchInCol = false); Matrix& AssignInnerProductOfMatrices(const Matrix& a, const Matrix& b); //this method will resize(1,1) first @@ -461,10 +461,18 @@ namespace Microsoft { namespace MSR { namespace CNTK { static void TensorShuffleScaleAndAdd(ElemType keepWeight, const Matrix& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const Matrix& b, Matrix& c); + void TensorOp(ElemType beta, const Matrix& a, ElemType alpha, ElementWiseOperator op, + const std::array & offsets, + const std::vector & regularOpDims, const std::array, 2> & regularStrides, + const std::vector & reducingOpDims, const std::array, 2> & reducingStrides); void 
TensorOp(ElemType beta, const Matrix& a, const Matrix& b, ElemType alpha, ElementWiseOperator op, const std::array & offsets, const std::vector & regularOpDims, const std::array, 3> & regularStrides, const std::vector & reducingOpDims, const std::array, 3> & reducingStrides); + void TensorOp(ElemType beta, const Matrix& a, const Matrix& b, const Matrix& c, ElemType alpha, ElementWiseOperator op, + const std::array & offsets, + const std::vector & regularOpDims, const std::array, 4> & regularStrides, + const std::vector & reducingOpDims, const std::array, 4> & reducingStrides); public: void Read(File& stream); void Write(File& stream) const; diff --git a/Source/Math/TensorView.cpp b/Source/Math/TensorView.cpp index e2c3cac27..0676b014d 100644 --- a/Source/Math/TensorView.cpp +++ b/Source/Math/TensorView.cpp @@ -57,17 +57,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { static bool Matches(size_t d1, size_t d2) { return d1 == 1 || d2 == 1 || d1 == d2; } // do two dimensions match? - template - void TensorView::DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op) + template + static void PrepareTensorOperands(array shapes, array & offsets, + vector & regularOpDims, + array, N> & regularStrides, + vector & reducingOpDims, + array, N> & reducingStrides) { -#define N 3 // later make this a template parameter. N=1 is possible for generators, such as constants. - array shapes; - TensorView & c = *this; - - shapes[0] = a.GetShape(); - shapes[1] = b.GetShape(); - shapes[2] = c.GetShape(); // last one is the output - // massage TensorShapes // Note that TensorShapes here may be shapes are stored or shapes with stride magic applied. @@ -131,6 +127,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t i = 0; i < N; i++) shapes[i] = shapes[i].DropDims(toDrop); opDims = TensorShape(opDims).DropDims(toDrop).GetDims(); // (ugh) + dims = opDims.size(); // #dims has changed + for (size_t i = 0; i < N; i++) + assert(dims == shapes[i].size()); // note: if op is a scalar, then we end up with 0 dimensions here, which is allowed //fprintf(stderr, "Post-drop: Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); @@ -141,7 +140,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t i = 0; i < N; i++) shapes[i] = shapes[i].WithBroadcastStrides(); - fprintf(stderr, "Op %d: %s op %s -> %s via %s\n", (int)op, string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); + //fprintf(stderr, "%s op %s -> %s via %s\n", string(shapes[0]).c_str(), string(shapes[1]).c_str(), string(shapes[2]).c_str(), string(TensorShape(opDims)).c_str()); // determine inverse broadcasting dimensions // Inverse broadcasting dims are actual for loops in the kernel, whereas broadcasting input dims are handled by the thread index. 
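(For intuition, here is a minimal standalone sketch, written by analogy and not taken
from the CNTK sources, of what the regular/reducing split computes for the case of one
regular and one reducing dimension; StridedSumSketch and all parameter names are
illustrative:)

    #include <cstddef>

    // c[i] = beta * c[i] + alpha * sum_j a[i * regStrideA + j * redStrideA]
    void StridedSumSketch(float beta, const float* a, float alpha, float* c,
                          size_t regDim, ptrdiff_t regStrideA, ptrdiff_t regStrideC,
                          size_t redDim, ptrdiff_t redStrideA)
    {
        for (size_t i = 0; i < regDim; i++)     // regular dim: one iteration per output element
        {
            float aggregate = 0;
            for (size_t j = 0; j < redDim; j++) // reducing dim: accumulated into a single output element
                aggregate += a[(ptrdiff_t)i * regStrideA + (ptrdiff_t)j * redStrideA];
            float& out = c[(ptrdiff_t)i * regStrideC];
            out = (beta == 0) ? alpha * aggregate // beta == 0: c may be uninitialized, so don't read it
                              : beta * out + alpha * aggregate;
        }
    }

The templated TensorOpWithRegularLoop/TensorOpReduction machinery in CPUMatrix.cpp
generalizes this to up to 4 regular and 2 reducing dimensions, with the loop nest
expanded at compile time.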
@@ -171,23 +170,55 @@ namespace Microsoft { namespace MSR { namespace CNTK { isReducingDim[k] = shapes.back()[k] == 1; // form the regular (non-inverse-broadcasting) dims - array, N> regularStrides; for (size_t i = 0; i < N; i++) regularStrides[i] = shapes[i].DropDims(isReducingDim).GetStrides(); - auto regularOpDims = TensorShape(opDims).DropDims(isReducingDim).GetDims(); // (ugh) + regularOpDims = TensorShape(opDims).DropDims(isReducingDim).GetDims(); // (ugh) // form the inverse-broadcasting dims vector isRegularDim(dims); // true for each inverse-broadcasting dimension for (size_t k = 0; k < dims; k++) isRegularDim[k] = !isReducingDim[k]; // (no way to do this more nicely?) - array, N> reducingStrides; for (size_t i = 0; i < N; i++) reducingStrides[i] = shapes[i].DropDims(isRegularDim).GetStrides(); - auto reducingOpDims = TensorShape(opDims).DropDims(isRegularDim).GetDims(); // (ugh) + reducingOpDims = TensorShape(opDims).DropDims(isRegularDim).GetDims(); // (ugh) + + for (size_t i = 0; i < N; i++) + offsets[i] = shapes[i].GetOffset(); + } + + template + void TensorView::DoUnaryOpOf(ElemType beta, const TensorView & a, ElemType alpha, ElementWiseOperator op) + { + // prepare all tensor descriptor information as needed for execution + array offsets; + array, 2> regularStrides, reducingStrides; + vector regularOpDims, reducingOpDims; + PrepareTensorOperands(array { a.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); // now perform the operation - array offsets = { a.GetShape().GetOffset(), b.GetShape().GetOffset(), c.GetShape().GetOffset() }; - c.GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + GetSOB().TensorOp(beta, a.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + } + + template + void TensorView::DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op) + { + array offsets; + array, 3> regularStrides, reducingStrides; + vector regularOpDims, reducingOpDims; + PrepareTensorOperands(array { a.GetShape(), b.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + + GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + } + + template + void TensorView::DoTernaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha, ElementWiseOperator op) + { + array offsets; + array, 4> regularStrides, reducingStrides; + vector regularOpDims, reducingOpDims; + PrepareTensorOperands(array { a.GetShape(), b.GetShape(), c.GetShape(), GetShape() }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + + GetSOB().TensorOp(beta, a.GetSOB(), b.GetSOB(), c.GetSOB(), alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } // simple test function for testing stuff @@ -204,8 +235,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { m2.SetValue(2, 1, { 42, 13 }); - // broadcasting of an input + // unary ops m3.Resize(2, 3); + TensorView(m3).DoSqrtOf(0, TensorView(m1), 1); + m3.Print(); + + // broadcasting of an input TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1); m3.Print(); diff --git a/Source/Math/TensorView.h b/Source/Math/TensorView.h index d3b3eef02..1ceb0332a 100644 --- a/Source/Math/TensorView.h +++ b/Source/Math/TensorView.h @@ 
-56,6 +56,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { // If beta == 0, c is not read out, i.e. it can be uninitialized or contain NaNs. // ------------------------------------------------------------------- +#define DeclareUnaryTensorOp(oper) \ + void Do ## oper ## Of(ElemType beta, const TensorView & a, ElemType alpha) { DoUnaryOpOf(beta, a, alpha, ElementWiseOperator::op ## oper); } + + DeclareUnaryTensorOp(Copy); + DeclareUnaryTensorOp(Negate); DeclareUnaryTensorOp(Not); + DeclareUnaryTensorOp(Abs); + DeclareUnaryTensorOp(Sigmoid); DeclareUnaryTensorOp(SigmoidDerivative); DeclareUnaryTensorOp(Tanh); DeclareUnaryTensorOp(Sqrt); DeclareUnaryTensorOp(Exp); DeclareUnaryTensorOp(Log); DeclareUnaryTensorOp(LinearRectifierDerivative); DeclareUnaryTensorOp(Cosine); DeclareUnaryTensorOp(NegativeSine); + DeclareUnaryTensorOp(SaturateBetaAlpha); DeclareUnaryTensorOp(SumAlpha); DeclareUnaryTensorOp(SubDifferenceToAlpha); DeclareUnaryTensorOp(SubDifferenceFromAlpha); + #define DeclareBinaryTensorOp(oper) \ void Do ## oper ## Of(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha) { DoBinaryOpOf(beta, a, b, alpha, ElementWiseOperator::op ## oper); } @@ -63,11 +72,18 @@ namespace Microsoft { namespace MSR { namespace CNTK { DeclareBinaryTensorOp(LogSum); DeclareBinaryTensorOp(Max); DeclareBinaryTensorOp(Min); DeclareBinaryTensorOp(EQ); DeclareBinaryTensorOp(NE); DeclareBinaryTensorOp(GT); DeclareBinaryTensorOp(LT); DeclareBinaryTensorOp(GE); DeclareBinaryTensorOp(LE); +#define DeclareTernaryTensorOp(oper) \ + void Do ## oper ## Of(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha) { DoTernaryOpOf(beta, a, b, c, alpha, ElementWiseOperator::op ## oper); } + + DeclareTernaryTensorOp(Cond); + static void Test(); private: + void DoUnaryOpOf(ElemType beta, const TensorView & a, ElemType alpha, ElementWiseOperator op); void DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op); + void DoTernaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha, ElementWiseOperator op); // ------------------------------------------------------------------- // sob members From 679c3c52984b1c5854d8003e2950501eeaaaee4a Mon Sep 17 00:00:00 2001 From: Mark Hillebrand Date: Tue, 15 Dec 2015 12:39:43 +0000 Subject: [PATCH 11/19] Source/Readers/LMSequenceReader/: also build SequenceWriter on Linux --- Makefile | 1 + .../LMSequenceReader/SequenceWriter.cpp | 4 +-- .../Readers/LMSequenceReader/SequenceWriter.h | 36 ++++++++++--------- 3 files changed, 23 insertions(+), 18 deletions(-) diff --git a/Makefile b/Makefile index 8c01ff667..fbff1ad0e 100644 --- a/Makefile +++ b/Makefile @@ -309,6 +309,7 @@ LMSEQUENCEREADER_SRC =\ $(SOURCEDIR)/Readers/LMSequenceReader/Exports.cpp \ $(SOURCEDIR)/Readers/LMSequenceReader/SequenceParser.cpp \ $(SOURCEDIR)/Readers/LMSequenceReader/SequenceReader.cpp \ + $(SOURCEDIR)/Readers/LMSequenceReader/SequenceWriter.cpp \ LMSEQUENCEREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(LMSEQUENCEREADER_SRC)) diff --git a/Source/Readers/LMSequenceReader/SequenceWriter.cpp b/Source/Readers/LMSequenceReader/SequenceWriter.cpp index 915052c72..5a74afd98 100644 --- a/Source/Readers/LMSequenceReader/SequenceWriter.cpp +++ b/Source/Readers/LMSequenceReader/SequenceWriter.cpp @@ -4,10 +4,10 @@ // // -// - #include "stdafx.h" +#ifdef _WIN32 #include +#endif #include "Basics.h" #include #include diff --git 
a/Source/Readers/LMSequenceReader/SequenceWriter.h b/Source/Readers/LMSequenceReader/SequenceWriter.h index 99eec4da3..06ab0fd3f 100644 --- a/Source/Readers/LMSequenceReader/SequenceWriter.h +++ b/Source/Readers/LMSequenceReader/SequenceWriter.h @@ -12,21 +12,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { - template - void DATAWRITER_API GetWriter(IDataWriter** pwriter) - { - *pwriter = new LMSequenceWriter(); - } - - extern "C" DATAWRITER_API void GetWriterF(IDataWriter** pwriter) - { - GetWriter(pwriter); - } - extern "C" DATAWRITER_API void GetWriterD(IDataWriter** pwriter) - { - GetWriter(pwriter); - } - template class LMSequenceWriter : public IDataWriter { @@ -65,8 +50,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { } public: + using LabelType = typename IDataWriter::LabelType; + using LabelIdType = typename IDataWriter::LabelIdType; void GetSections(std::map& /*sections*/){} - void SaveMapping(std::wstring saveId, const std::map& /*labelMapping*/){} + void SaveMapping(std::wstring saveId, const std::map& /*labelMapping*/){} public: template @@ -77,4 +64,21 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual bool SaveData(size_t recordStart, const std::map& matrices, size_t numRecords, size_t datasetSize, size_t byteVariableSized); }; + template + void DATAWRITER_API GetWriter(IDataWriter** pwriter) + { + assert(pwriter != nullptr); + *pwriter = new LMSequenceWriter(); + assert(*pwriter != nullptr); + } + + extern "C" DATAWRITER_API void GetWriterF(IDataWriter** pwriter) + { + GetWriter(pwriter); + } + extern "C" DATAWRITER_API void GetWriterD(IDataWriter** pwriter) + { + GetWriter(pwriter); + } + }}} From 91eadb058777589aba4a2ed11bc3530068a272c1 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 18 Dec 2015 08:54:19 -0800 Subject: [PATCH 12/19] moved all tensor ops to a new header TensorOps.h so they can be shared between matrix types; also moved the float/double-unified math overloads (e.g. 
exp_()) there, as well as additional typically needed functions such as Sigmoid()
---
 .../lyx/CNTKBook_CN_Chapter.lyx | 2 +-
 Source/Math/CPUMatrix.cpp | 128 +++++------------
 Source/Math/CommonMatrix.h | 24 ++--
 Source/Math/Math.vcxproj | 1 +
 Source/Math/Math.vcxproj.filters | 3 +
 Source/Math/Matrix.cpp | 2 +
 Source/Math/TensorOps.h | 132 ++++++++++++++++++
 Source/Math/TensorView.h | 29 ++--
 8 files changed, 206 insertions(+), 115 deletions(-)
 create mode 100644 Source/Math/TensorOps.h

diff --git a/Documentation/CNTK-TechReport/lyx/CNTKBook_CN_Chapter.lyx b/Documentation/CNTK-TechReport/lyx/CNTKBook_CN_Chapter.lyx
index 2563ad515..8e9a5c845 100644
--- a/Documentation/CNTK-TechReport/lyx/CNTKBook_CN_Chapter.lyx
+++ b/Documentation/CNTK-TechReport/lyx/CNTKBook_CN_Chapter.lyx
@@ -9154,7 +9154,7 @@ L
 \begin_layout Standard
 \begin_inset Formula
 \begin{eqnarray}
-\alpha_{t}\left(i\right) & \leftarrow & h_{it}+logadd_{k}\left(\delta_{t-1}(k)+\eta a_{ki}\right)\\
+\alpha_{t}\left(i\right) & \leftarrow & h_{it}+LogAdd_{k}\left(\delta_{t-1}(k)+\eta a_{ki}\right)\\
 \mathbf{\frac{\partial R}{\partial\delta_{t-1}(i)}} & \leftarrow & \sum_{j}\frac{\partial C_{logadd}}{\partial\delta_{t}(j)}\frac{\exp(\delta_{t-1}(i)+a_{i,j})}{\sum_{k}\exp(\delta_{t-1}(k)+a_{k,j})}\\
 \mathbf{\frac{\partial R}{\partial\delta_{T}(i)}} & \leftarrow & \frac{\exp(\delta_{T}(i))}{\sum_{k}\exp(\delta_{T}(k))}\\
 \frac{\partial R}{\partial h_{t}(i)} & \leftarrow & l_{t}(i)-\frac{\partial R}{\partial\delta_{t}(i)}\\
diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp
index 2718ae8e8..ca08faf71 100644
--- a/Source/Math/CPUMatrix.cpp
+++ b/Source/Math/CPUMatrix.cpp
@@ -9,12 +9,13 @@
 #include "stdafx.h"
 #include "Basics.h"
 #include "File.h"
-
+#include "CPUMatrix.h"
+#include "TensorOps.h"
 #include
 #include
 #include
 #include
-#include "CPUMatrix.h"
+
 #include
 #include
 #include
@@ -4304,7 +4305,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (sample_id == 0)
 sample_prob = -sample_prob;
 double score_noise = log_num_noise_samples + sample_prob;
- double z = logadd(score, score_noise);
+ double z = LogAdd(score, score_noise);
 double logprob = score - z;
 double logprob_noise = score_noise - z;
 tmp(sample_id, instance_id) = (ElemType)-std::exp(logprob);
@@ -5258,32 +5259,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 #pragma endregion Static BLAS Functions

- template<class ElemType>
- ElemType logadd_(ElemType x, ElemType y)
- {
- if (x < y)
- {
- ElemType temp = x; x = y; y = temp;
- }
- ElemType diff = y - x;
- if (diff < (ElemType)MINLOGEXP)
- {
- return (x < (ElemType)LSMALL) ? 
(ElemType)LZERO : x; - } - else - { - ElemType z = exp_(diff); - return x + log_((ElemType)1.0 + z); - } - } - double logadd(double x, double y) { return logadd_(x, y); } + // 'double' version of LogAdd + double LogAddD(double x, double y) { return LogAdd(x, y); } template ElemType CPUMatrix::LogAddSumOfElements() const { ElemType fAlpha = (ElemType)LZERO; for (int k = 0; k < GetNumElements(); k++) - fAlpha = (ElemType) logadd(fAlpha, m_pArray[k]); + fAlpha = (ElemType) LogAddD(fAlpha, m_pArray[k]); return fAlpha; } @@ -5330,7 +5314,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { fSum = (ElemType)LZERO; for (int j = 0; j < iNumLab; j++) { - fSum = (ElemType)logadd((double)fSum, alpha(j, t)); + fSum = (ElemType)LogAddD(fSum, alpha(j, t)); } fTmp = alpha(k, t) - fSum; @@ -5343,10 +5327,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { fSum = (ElemType)LZERO; for (int m = 0; m < iNumLab; m++) { - fSum = (ElemType)logadd((double)fSum, alpha(m, t) + pair_scores(j, m)); + fSum = (ElemType)LogAddD(fSum, alpha(m, t) + pair_scores(j, m)); } - fTmp = (ElemType)logadd(fTmp, beta(j, t + 1) + alpha(k, t) + pair_scores(j, k) - fSum); + fTmp = (ElemType)LogAddD(fTmp, beta(j, t + 1) + alpha(k, t) + pair_scores(j, k) - fSum); } beta(k, t) = fTmp; } @@ -5455,7 +5439,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { else{ fTmp2 = a(k, 0); } - fSum = (ElemType)logadd(fSum, fTmp2 + pair_scores(j, k)); + fSum = (ElemType)LogAddD(fSum, fTmp2 + pair_scores(j, k)); } fTmp -= fSum; @@ -5537,6 +5521,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // TensorView support // ----------------------------------------------------------------------- + // To save time, this makes extensive use of templates and macros. + // perform loop over reduction index m // This function is declared inside a wrapper struct to allow partial specialization (m = -1). template @@ -5654,43 +5640,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } - template - static inline ElemType Sigmoid(ElemType z) - { - if (z >= 0) - return 1 / (1 + exp_(-z)); - else - { - ElemType v = exp_(z); - return v / (1 + v); - } - } - template - static inline ElemType SigmoidDerivative(ElemType z) - { - ElemType v = Sigmoid(z); - return v * (1 - v); - } - template - static inline ElemType LinearRectifierDerivative(ElemType z) - { - return z > 0 ? (ElemType)1 : 0; - } - template - static inline ElemType Sqrt(ElemType z) - { - return sqrt_(max(0, z)); - } - - // define a static function for every operation -#define DefUnaryOp(op, expr) template static inline ElemType Op ## op(ElemType a) { return expr; } - - DefUnaryOp(Copy, a); - DefUnaryOp(Negate, -a); DefUnaryOp(Not, !a); - DefUnaryOp(Abs, fabs_(a)); - DefUnaryOp(Sigmoid, Sigmoid(a)); DefUnaryOp(SigmoidDerivative, SigmoidDerivative(a)); DefUnaryOp(Tanh, tanh_(a)); DefUnaryOp(Sqrt, Sqrt(a)); DefUnaryOp(Exp, exp_(a)); DefUnaryOp(Log, log_(a)); DefUnaryOp(LinearRectifierDerivative, LinearRectifierDerivative(a)); DefUnaryOp(Cosine, cos_(a)); DefUnaryOp(NegativeSine, -sin_(a)); - //DefUnaryOp(SaturateBetaAlpha); DefUnaryOp(SumAlpha); DefUnaryOp(SubDifferenceToAlpha); DefUnaryOp(SubDifferenceFromAlpha); - // perform unary operation 'op' on a giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides // This maps 'op' to a lambda. 
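 // For illustration only (an expansion sketch, not part of this patch): given the
 // CaseUnaryTensorOp macro assumed below, CaseUnaryTensorOp(Sqrt) expands to roughly
 //   case ElementWiseOperator::opSqrt:
 //       return TensorOpWithFn(beta, pointers, alpha,
 //                             [](const array<ElemType*, 2> & pp) { return OpSqrt(*(pp[0])); },
 //                             offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
 // i.e. each opcode binds its scalar Op function into a lambda once, while the
 // strided/reducing traversal inside TensorOpWithFn is shared by all ops.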
template @@ -5699,29 +5648,18 @@ namespace Microsoft { namespace MSR { namespace CNTK { const std::vector & regularOpDims, const std::array, 2> & regularStrides, const std::vector & reducingOpDims, const std::array, 2> & reducingStrides) { + #define CaseUnaryTensorOp(oper) \ + case ElementWiseOperator::op ## oper: \ + return TensorOpWithFn(beta, pointers, alpha, [](const array & pp) { return Op ## oper((*(pp[0]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides) + array pointers = { a.m_pArray, m_pArray }; -#define CaseUnaryTensorOp(oper) \ - case ElementWiseOperator::op ## oper: \ - return TensorOpWithFn(beta, pointers, alpha, [](const array & pp) { return Op ## oper((*(pp[0]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides) switch (op) { - CaseUnaryTensorOp(Copy); - CaseUnaryTensorOp(Negate); CaseUnaryTensorOp(Not); - CaseUnaryTensorOp(Abs); - CaseUnaryTensorOp(Sigmoid); CaseUnaryTensorOp(SigmoidDerivative); CaseUnaryTensorOp(Tanh); CaseUnaryTensorOp(Sqrt); CaseUnaryTensorOp(Exp); CaseUnaryTensorOp(Log); CaseUnaryTensorOp(LinearRectifierDerivative); CaseUnaryTensorOp(Cosine); CaseUnaryTensorOp(NegativeSine); - // functions with lambda arguments--these are different - //CaseUnaryTensorOp(SaturateBetaAlpha); CaseUnaryTensorOp(SumAlpha); CaseUnaryTensorOp(SubDifferenceToAlpha); CaseUnaryTensorOp(SubDifferenceFromAlpha); + ForAllUnaryOps(CaseUnaryTensorOp); default: LogicError("TensorUnaryOp: Unknown op code %d.", (int)op); } } - // define a static function for every operation -#define DefBinaryOp(op, expr) template static inline ElemType Op ## op(ElemType a, ElemType b) { return expr; } - - DefBinaryOp(Sum, a + b); DefBinaryOp(Difference, a - b); DefBinaryOp(ElementWiseProduct, a*b); DefBinaryOp(ElementWiseQuotient, a / b); - DefBinaryOp(LogSum, logadd_(a, b)); DefBinaryOp(Max, a > b ? a : b); DefBinaryOp(Min, a < b ? a : b); - DefBinaryOp(EQ, a == b); DefBinaryOp(NE, a != b); DefBinaryOp(GT, a > b); DefBinaryOp(LT, a < b); DefBinaryOp(GE, a >= b); DefBinaryOp(LE, a <= b); - // perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides // This maps 'op' to a lambda. 
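 // For illustration only (a usage sketch, not part of this patch; m1/m2/m3 are the
 // matrices from TensorView::Test(), with m1 assumed to be 2 x 3 and m2 is 2 x 1):
 //   TensorView<ElemType>(m3).DoSumOf(0, TensorView<ElemType>(m1), TensorView<ElemType>(m2), 1);
 // dispatches to opSum below and, since beta == 0 and alpha == 1, computes
 //   m3(i,j) = m1(i,j) + m2(i,0)
 // i.e. the size-1 column dimension of m2 is broadcast across the columns of the output.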
template<class ElemType>
@@ -5730,24 +5668,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 3> & regularStrides,
 const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 3> & reducingStrides)
 {
+ #define CaseBinaryTensorOp(oper) \
+ case ElementWiseOperator::op ## oper: \
+ return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 3> & pp) { return Op ## oper((*(pp[0])), (*(pp[1]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
+
 array<ElemType*, 3> pointers = { a.m_pArray, b.m_pArray, m_pArray };
-#define CaseBinaryTensorOp(oper) \
- case ElementWiseOperator::op ## oper: \
- return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 3> & pp) { return Op ## oper((*(pp[0])), (*(pp[1]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
 switch (op)
 {
- CaseBinaryTensorOp(Sum); CaseBinaryTensorOp(Difference); CaseBinaryTensorOp(ElementWiseProduct); CaseBinaryTensorOp(ElementWiseQuotient);
- CaseBinaryTensorOp(LogSum); CaseBinaryTensorOp(Max); CaseBinaryTensorOp(Min);
- CaseBinaryTensorOp(EQ); CaseBinaryTensorOp(NE); CaseBinaryTensorOp(GT); CaseBinaryTensorOp(LT); CaseBinaryTensorOp(GE); CaseBinaryTensorOp(LE);
+ ForAllBinaryOps(CaseBinaryTensorOp);
 default: LogicError("TensorBinaryOp: Unknown op code %d.", (int)op);
 }
 }

- // define a static function for every operation
-#define DefTernaryOp(op, expr) template<class ElemType> static inline ElemType Op ## op(ElemType a, ElemType b, ElemType c) { return expr; }
-
- DefTernaryOp(Cond, a ? b : c);
-
 // perform ternary operation 'op' on a, b, and c giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
 // This maps 'op' to a lambda.
 template<class ElemType>
@@ -5756,18 +5688,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 const std::vector<size_t> & regularOpDims, const std::array<std::vector<ptrdiff_t>, 4> & regularStrides,
 const std::vector<size_t> & reducingOpDims, const std::array<std::vector<ptrdiff_t>, 4> & reducingStrides)
 {
+ #define CaseTernaryTensorOp(oper) \
+ case ElementWiseOperator::op ## oper: \
+ return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 4> & pp) { return Op ## oper((*(pp[0])), (*(pp[1])), (*(pp[2]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
+
 array<ElemType*, 4> pointers = { a.m_pArray, b.m_pArray, c.m_pArray, m_pArray };
-#define CaseTernaryTensorOp(oper) \
- case ElementWiseOperator::op ## oper: \
- return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 4> & pp) { return Op ## oper((*(pp[0])), (*(pp[1])), (*(pp[2]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
 switch (op)
 {
- CaseTernaryTensorOp(Cond);
+ ForAllTernaryOps(CaseTernaryTensorOp);
 default: LogicError("TensorTernaryOp: Unknown op code %d.", (int)op);
 }
 }

- // The explicit instantiation part
+ // -----------------------------------------------------------------------
+ // explicit instantiations
+ // -----------------------------------------------------------------------
+
 template class MATH_API CPUMatrix<float>;
 template class MATH_API CPUMatrix<double>;

diff --git a/Source/Math/CommonMatrix.h b/Source/Math/CommonMatrix.h
index 8bae0cfd5..8a73246c9 100644
--- a/Source/Math/CommonMatrix.h
+++ b/Source/Math/CommonMatrix.h
@@ -64,15 +64,23 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 // Note: not all of the above are actually implemented at present; and not all that's implemented has an opcode.
 };

- // declare float and double versions of a func under f_
- // e.g. 
exp_ -> exp(double), expf(float) -#define OverloadUnaryMathFns(func) \ - static inline float func ## _(float arg) { return func ## f(arg); } \ - static inline double func ## _(double arg) { return func(arg); } + // helper to apply a C macro for all operations of each kind +#define ForAllUnaryOps(Macro) \ + Macro(Copy); \ + Macro(Negate); Macro(Not); \ + Macro(Abs); \ + Macro(Sigmoid); Macro(SigmoidDerivative); Macro(Tanh); Macro(Sqrt); Macro(Exp); Macro(Log); Macro(LinearRectifierDerivative); Macro(Cosine); Macro(NegativeSine); - OverloadUnaryMathFns(fabs); OverloadUnaryMathFns(sqrt); - OverloadUnaryMathFns(exp); OverloadUnaryMathFns(log); - OverloadUnaryMathFns(tanh); OverloadUnaryMathFns(cos); OverloadUnaryMathFns(sin); +#define ForAllParameterizedUnaryOps(Macro) \ + Macro(SaturateBetaAlpha); Macro(SumAlpha); Macro(SubDifferenceToAlpha); Macro(SubDifferenceFromAlpha); + +#define ForAllBinaryOps(Macro) \ + Macro(Sum); Macro(Difference); Macro(ElementWiseProduct); Macro(ElementWiseQuotient); \ + Macro(LogSum); Macro(Max); Macro(Min); \ + Macro(EQ); Macro(NE); Macro(GT); Macro(LT); Macro(GE); Macro(LE); + +#define ForAllTernaryOps(Macro) \ + Macro(Cond); // ----------------------------------------------------------------------- // various enums to describe diff --git a/Source/Math/Math.vcxproj b/Source/Math/Math.vcxproj index 738b59eed..42bd05ebd 100644 --- a/Source/Math/Math.vcxproj +++ b/Source/Math/Math.vcxproj @@ -162,6 +162,7 @@ + diff --git a/Source/Math/Math.vcxproj.filters b/Source/Math/Math.vcxproj.filters index 625886e2e..a46a3807a 100644 --- a/Source/Math/Math.vcxproj.filters +++ b/Source/Math/Math.vcxproj.filters @@ -70,6 +70,9 @@ Tensors + + Tensors + diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index f265413cd..66cea78ae 100644 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -4887,6 +4887,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { return x - y * floor(x / y); } + // TODO: use static LogAdd() as defined in TensorOps.h + // Not doing this currently because that one uses ElemType for all ops, while this one uses double inside. Must compare before making this change. template ElemType Matrix::LogAdd(ElemType x, ElemType y) { diff --git a/Source/Math/TensorOps.h b/Source/Math/TensorOps.h new file mode 100644 index 000000000..1ee9821de --- /dev/null +++ b/Source/Math/TensorOps.h @@ -0,0 +1,132 @@ +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// + +// This implements the elementwise tensor operations, including helper macros and some actual functions. + +#pragma once + +#include "Basics.h" +#include "CommonMatrix.h" + +#pragma push_macro("TENSOR_OPS_DECL") +#ifndef TENSOR_OPS_DECL // to make these accessible to CUDA kernels, say '#define TENSOR_OPS_DECL __device__ __host__' +#define TENSOR_OPS_DECL +#endif + +#pragma push_macro("DECL") +#define DECL static inline TENSOR_OPS_DECL + +// This class is exported from the Math.dll. +namespace Microsoft { namespace MSR { namespace CNTK { + + // ----------------------------------------------------------------------- + // unified overloads for float/double math functions + // + // Declare float and double versions of the functions f we need as f_(), + // e.g. exp_ -> exp(double), expf(float). 
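+ //
+ // For illustration (a hypothetical helper, not part of this header): the overloads
+ // let templated code stay precision-agnostic, e.g.
+ //   template <class ElemType>
+ //   DECL ElemType Softplus(ElemType z) { return log_((ElemType)1 + exp_(z)); }
+ // resolves to expf()/logf() for float and exp()/log() for double at compile time.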
+ // -----------------------------------------------------------------------
+
+#pragma push_macro("OverloadUnaryMathFns")
+ #define OverloadUnaryMathFns(func) \
+ DECL float func ## _(float arg) { return func ## f(arg); } \
+ DECL double func ## _(double arg) { return func(arg); }
+
+ OverloadUnaryMathFns(fabs); OverloadUnaryMathFns(sqrt);
+ OverloadUnaryMathFns(exp); OverloadUnaryMathFns(log);
+ OverloadUnaryMathFns(tanh); OverloadUnaryMathFns(cos); OverloadUnaryMathFns(sin);
+#pragma pop_macro("OverloadUnaryMathFns")
+
+ // -----------------------------------------------------------------------
+ // additional functions that are standard in our context
+ // -----------------------------------------------------------------------
+
+ template <class ElemType>
+ DECL ElemType Sigmoid(ElemType z)
+ {
+ if (z >= 0)
+ return 1 / (1 + exp_(-z));
+ else
+ {
+ ElemType v = exp_(z);
+ return v / (1 + v);
+ }
+ }
+
+ template <class ElemType>
+ DECL ElemType SigmoidDerivative(ElemType z)
+ {
+ ElemType v = Sigmoid(z);
+ return v * (1 - v);
+ }
+
+ template <class ElemType>
+ DECL ElemType LinearRectifierDerivative(ElemType z)
+ {
+ return z > 0 ? (ElemType)1 : 0;
+ }
+
+ template <class ElemType>
+ DECL ElemType Sqrt(ElemType z)
+ {
+ // BUGBUG: Why clip to 0? An invalid sqrt() should show up as a NaN in the result, instead of hiding it.
+ return sqrt_(z > 0 ? z : 0);
+ }
+
+ template <class ElemType>
+ DECL ElemType LogAdd(ElemType x, ElemType y)
+ {
+ if (x < y)
+ {
+ ElemType temp = x; x = y; y = temp;
+ }
+ ElemType diff = y - x;
+ if (diff < (ElemType)MINLOGEXP)
+ {
+ return (x < (ElemType)LSMALL) ? (ElemType)LZERO : x;
+ }
+ else
+ {
+ ElemType z = exp_(diff);
+ return x + log_((ElemType)1.0 + z);
+ }
+ }
+
+ // -----------------------------------------------------------------------
+ // ElementWiseOperator implementations
+ //
+ // Define a static function for every ElementWiseOperator (CommonMatrix.h).
+ // -----------------------------------------------------------------------
+
+#pragma push_macro("DefUnaryOp")
+ #define DefUnaryOp(op, expr) template <class ElemType> DECL ElemType Op ## op(ElemType a) { return expr; }
+
+ DefUnaryOp(Copy, a);
+ DefUnaryOp(Negate, -a); DefUnaryOp(Not, !a);
+ DefUnaryOp(Abs, fabs_(a));
+ DefUnaryOp(Sigmoid, Sigmoid(a)); DefUnaryOp(SigmoidDerivative, SigmoidDerivative(a)); DefUnaryOp(Tanh, tanh_(a)); DefUnaryOp(Sqrt, Sqrt(a)); DefUnaryOp(Exp, exp_(a)); DefUnaryOp(Log, log_(a)); DefUnaryOp(LinearRectifierDerivative, LinearRectifierDerivative(a)); DefUnaryOp(Cosine, cos_(a)); DefUnaryOp(NegativeSine, -sin_(a));
+#pragma pop_macro("DefUnaryOp")
+
+ // parameterized unary ops
+ //DefUnaryOp(SaturateBetaAlpha); DefUnaryOp(SumAlpha); DefUnaryOp(SubDifferenceToAlpha); DefUnaryOp(SubDifferenceFromAlpha);
+
+#pragma push_macro("DefBinaryOp")
+ #define DefBinaryOp(op, expr) template <class ElemType> DECL ElemType Op ## op(ElemType a, ElemType b) { return expr; }
+
+ DefBinaryOp(Sum, a + b); DefBinaryOp(Difference, a - b); DefBinaryOp(ElementWiseProduct, a*b); DefBinaryOp(ElementWiseQuotient, a / b);
+ DefBinaryOp(LogSum, LogAdd(a, b)); DefBinaryOp(Max, a > b ? a : b); DefBinaryOp(Min, a < b ? a : b);
+ DefBinaryOp(EQ, a == b); DefBinaryOp(NE, a != b); DefBinaryOp(GT, a > b); DefBinaryOp(LT, a < b); DefBinaryOp(GE, a >= b); DefBinaryOp(LE, a <= b);
+#pragma pop_macro("DefBinaryOp")
+
+#pragma push_macro("DefTernaryOp")
+ #define DefTernaryOp(op, expr) template <class ElemType> DECL ElemType Op ## op(ElemType a, ElemType b, ElemType c) { return expr; }
+
+ DefTernaryOp(Cond, a ? 
b : c); +#pragma pop_macro("DefTernaryOp") + +}}} +#pragma pop_macro("DECL") +#pragma pop_macro("TENSOR_OPS_DECL") diff --git a/Source/Math/TensorView.h b/Source/Math/TensorView.h index 1ceb0332a..2baaef473 100644 --- a/Source/Math/TensorView.h +++ b/Source/Math/TensorView.h @@ -4,7 +4,7 @@ // // -// This implements the TensorView class, which is a layer around Matrix that reinterprets its content as a generic tensor. +// This implements the TensorView class, which is a layer around Matrix that reinterprets its content as a generic tensor. [fseide] #pragma once @@ -56,26 +56,35 @@ namespace Microsoft { namespace MSR { namespace CNTK { // If beta == 0, c is not read out, i.e. it can be uninitialized or contain NaNs. // ------------------------------------------------------------------- +#pragma push_macro("DeclareUnaryTensorOp") #define DeclareUnaryTensorOp(oper) \ void Do ## oper ## Of(ElemType beta, const TensorView & a, ElemType alpha) { DoUnaryOpOf(beta, a, alpha, ElementWiseOperator::op ## oper); } - DeclareUnaryTensorOp(Copy); - DeclareUnaryTensorOp(Negate); DeclareUnaryTensorOp(Not); - DeclareUnaryTensorOp(Abs); - DeclareUnaryTensorOp(Sigmoid); DeclareUnaryTensorOp(SigmoidDerivative); DeclareUnaryTensorOp(Tanh); DeclareUnaryTensorOp(Sqrt); DeclareUnaryTensorOp(Exp); DeclareUnaryTensorOp(Log); DeclareUnaryTensorOp(LinearRectifierDerivative); DeclareUnaryTensorOp(Cosine); DeclareUnaryTensorOp(NegativeSine); - DeclareUnaryTensorOp(SaturateBetaAlpha); DeclareUnaryTensorOp(SumAlpha); DeclareUnaryTensorOp(SubDifferenceToAlpha); DeclareUnaryTensorOp(SubDifferenceFromAlpha); + ForAllUnaryOps(DeclareUnaryTensorOp); + ForAllParameterizedUnaryOps(DeclareUnaryTensorOp); + //DeclareUnaryTensorOp(Copy); + //DeclareUnaryTensorOp(Negate); DeclareUnaryTensorOp(Not); + //DeclareUnaryTensorOp(Abs); + //DeclareUnaryTensorOp(Sigmoid); DeclareUnaryTensorOp(SigmoidDerivative); DeclareUnaryTensorOp(Tanh); DeclareUnaryTensorOp(Sqrt); DeclareUnaryTensorOp(Exp); DeclareUnaryTensorOp(Log); DeclareUnaryTensorOp(LinearRectifierDerivative); DeclareUnaryTensorOp(Cosine); DeclareUnaryTensorOp(NegativeSine); + //DeclareUnaryTensorOp(SaturateBetaAlpha); DeclareUnaryTensorOp(SumAlpha); DeclareUnaryTensorOp(SubDifferenceToAlpha); DeclareUnaryTensorOp(SubDifferenceFromAlpha); +#pragma pop_macro("DeclareUnaryTensorOp") +#pragma push_macro("DeclareBinaryTensorOp") #define DeclareBinaryTensorOp(oper) \ void Do ## oper ## Of(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha) { DoBinaryOpOf(beta, a, b, alpha, ElementWiseOperator::op ## oper); } - DeclareBinaryTensorOp(Sum); DeclareBinaryTensorOp(Difference); DeclareBinaryTensorOp(ElementWiseProduct); DeclareBinaryTensorOp(ElementWiseQuotient); - DeclareBinaryTensorOp(LogSum); DeclareBinaryTensorOp(Max); DeclareBinaryTensorOp(Min); - DeclareBinaryTensorOp(EQ); DeclareBinaryTensorOp(NE); DeclareBinaryTensorOp(GT); DeclareBinaryTensorOp(LT); DeclareBinaryTensorOp(GE); DeclareBinaryTensorOp(LE); + ForAllBinaryOps(DeclareBinaryTensorOp); + //DeclareBinaryTensorOp(Sum); DeclareBinaryTensorOp(Difference); DeclareBinaryTensorOp(ElementWiseProduct); DeclareBinaryTensorOp(ElementWiseQuotient); + //DeclareBinaryTensorOp(LogSum); DeclareBinaryTensorOp(Max); DeclareBinaryTensorOp(Min); + //DeclareBinaryTensorOp(EQ); DeclareBinaryTensorOp(NE); DeclareBinaryTensorOp(GT); DeclareBinaryTensorOp(LT); DeclareBinaryTensorOp(GE); DeclareBinaryTensorOp(LE); +#pragma pop_macro("DeclareBinaryTensorOp") +#pragma push_macro("DeclareTernaryTensorOp") #define 
DeclareTernaryTensorOp(oper) \ void Do ## oper ## Of(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha) { DoTernaryOpOf(beta, a, b, c, alpha, ElementWiseOperator::op ## oper); } - DeclareTernaryTensorOp(Cond); + ForAllTernaryOps(DeclareTernaryTensorOp); +#pragma pop_macro("DeclareTernaryTensorOp") static void Test(); From b8de2fef4bcd181eb86ada7ef7f29fa2af2fb65b Mon Sep 17 00:00:00 2001 From: Alexey Kamenev Date: Wed, 16 Dec 2015 18:52:17 -0800 Subject: [PATCH 13/19] Added support for distributed reading in ImageReader. --- Source/Readers/ImageReader/ImageReader.cpp | 23 ++++++++++++++++++---- Source/Readers/ImageReader/ImageReader.h | 10 +++++++++- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/Source/Readers/ImageReader/ImageReader.cpp b/Source/Readers/ImageReader/ImageReader.cpp index e613d5d29..2b38da4a3 100644 --- a/Source/Readers/ImageReader/ImageReader.cpp +++ b/Source/Readers/ImageReader/ImageReader.cpp @@ -16,6 +16,7 @@ #include // TODO: this should go away once we update the parameter parsing #include #include +#include namespace Microsoft { namespace MSR { namespace CNTK { @@ -400,6 +401,10 @@ void ImageReader::InitFromConfig(const ConfigRecordType& config) m_prefetch = config(L"prefetch", true); + int cthread = config(L"numCPUThreads", 0); + if (cthread > 0) + omp_set_num_threads(cthread); + m_epochStart = 0; m_mbStart = 0; } @@ -412,11 +417,16 @@ void ImageReader::Destroy() } template -void ImageReader::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples) +void ImageReader::StartDistributedMinibatchLoop(size_t mbSize, size_t epoch, size_t subsetNum, size_t numSubsets, size_t requestedEpochSamples) { assert(mbSize > 0); + assert(numSubsets > 0); + assert(subsetNum < numSubsets); assert(requestedEpochSamples > 0); + m_subsetNum = subsetNum; + m_numSubsets = numSubsets; + if (m_imgListRand) std::shuffle(m_files.begin(), m_files.end(), m_rng); @@ -505,10 +515,15 @@ size_t ImageReader::ReadImages() std::fill(m_labBuf.begin(), m_labBuf.end(), static_cast(0)); + size_t actualMBSize = mbLim - m_mbStart; + size_t iStart = actualMBSize * m_subsetNum / m_numSubsets; + size_t iLim = actualMBSize * (m_subsetNum + 1) / m_numSubsets; + size_t subsetSize = iLim - iStart; + #pragma omp parallel for ordered schedule(dynamic) - for (long long i = 0; i < static_cast(mbLim - m_mbStart); i++) + for (long long i = 0; i < static_cast(subsetSize); i++) { - const auto& p = m_files[i + m_mbStart]; + const auto& p = m_files[m_mbStart + iStart + i]; cv::Mat img{ cv::imread(p.first, cv::IMREAD_COLOR) }; if (!img.data) RuntimeError("Cannot read image file %s", p.first.c_str()); @@ -522,7 +537,7 @@ size_t ImageReader::ReadImages() m_labBuf[m_labDim * i + p.second] = 1; } - return mbLim - m_mbStart; + return subsetSize; } template class ImageReader; diff --git a/Source/Readers/ImageReader/ImageReader.h b/Source/Readers/ImageReader/ImageReader.h index 32e5a8a07..cab5d07b7 100644 --- a/Source/Readers/ImageReader/ImageReader.h +++ b/Source/Readers/ImageReader/ImageReader.h @@ -39,7 +39,12 @@ public: virtual void Init(const ScriptableObjects::IConfigRecord & config) override { InitFromConfig(config); } #endif void Destroy() override; - void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize) override; + bool SupportsDistributedMBRead() const { return true; } + void StartDistributedMinibatchLoop(size_t mbSize, size_t epoch, size_t subsetNum, size_t numSubsets, size_t 
requestedEpochSamples = requestDataSize) override;
+ void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize) override
+ {
+ return StartDistributedMinibatchLoop(mbSize, epoch, 0, 1, requestedEpochSamples);
+ }
 bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices) override;
 bool DataEnd(EndDataType endDataType) override;
@@ -73,6 +78,9 @@ private:
 size_t m_epochStart;
 size_t m_mbStart;

+ size_t m_subsetNum;
+ size_t m_numSubsets;
+
 bool m_prefetch;
 std::future<size_t> m_mbPrefetchFut;
 std::vector<ElemType> m_featBuf;

From 1f26215616ee2524d220efa28fcb1379b03ab722 Mon Sep 17 00:00:00 2001
From: Alexey Kamenev
Date: Thu, 17 Dec 2015 12:31:17 -0800
Subject: [PATCH 14/19] Fixed mbStart in ImageReader for distributed case.

---
 Source/Readers/ImageReader/ImageReader.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Source/Readers/ImageReader/ImageReader.cpp b/Source/Readers/ImageReader/ImageReader.cpp
index 2b38da4a3..6b53514a4 100644
--- a/Source/Readers/ImageReader/ImageReader.cpp
+++ b/Source/Readers/ImageReader/ImageReader.cpp
@@ -467,7 +467,6 @@ bool ImageReader::GetMinibatch(std::map
 m_pMBLayout->InitAsFrameMode(mbSize);
- m_mbStart += mbSize;
 // It is safe to run prefetching with just one buffer as SetValue is synchronous so there will be no race.
 m_mbPrefetchFut = std::async(GetLaunchPolicy(m_prefetch), [this]() { return ReadImages(); });
@@ -537,6 +536,7 @@ size_t ImageReader::ReadImages()
 m_labBuf[m_labDim * i + p.second] = 1;
 }
+ m_mbStart += actualMBSize;
 return subsetSize;
 }

From 1a1bd17c21bd3f4bfb9881a4ef0cde1a0f3abf5f Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Fri, 18 Dec 2015 10:01:17 -0800
Subject: [PATCH 15/19] bug fix: ComputationNode::DetermineNumCols() was an outdated pre-refactoring hold-over with a now incorrect validity check. Can just be removed. Should fix the issue reported by user xiaoqing; removed unnecessary and inconsistent use of 'this->' throughout Matrix.cpp, also fixed some bad indentations

---
 .../ComputationNetworkLib/ComputationNode.cpp | 3 +-
 .../ComputationNetworkLib/ComputationNode.h | 12 -
 Source/Math/Matrix.cpp | 684 +++++++++---------
 Source/Math/Matrix.h | 1 +
 4 files changed, 344 insertions(+), 356 deletions(-)

diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp
index a104632f7..d6738861e 100644
--- a/Source/ComputationNetworkLib/ComputationNode.cpp
+++ b/Source/ComputationNetworkLib/ComputationNode.cpp
@@ -43,13 +43,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 // all are consistent: install it
 LinkToMBLayout(pMBLayout);
 }
+ // single input that maps its input element-wise (e.g. Sigmoid)
 void ComputationNodeBase::ValidateUnaryMap(bool isFinalValidationPass)
 {
 assert(m_inputs.size() == 1);
 ComputationNodeBase::Validate(isFinalValidationPass);
 InferMBLayoutFromInputsForStandardCase();
- SetDims(m_inputs[0]->GetNumRows(), DetermineNumCols(m_inputs[0]));
+ SetDims(m_inputs[0]);
 InferImageDimsFromInputs();
 }
 // binary zip operation, e.g. 
Plus diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 7f89060a6..a7649e4c7 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -340,18 +340,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } // helper functions for common cases - private: - // determine number of columns from a child and/or layout - size_t DetermineNumCols(const ComputationNodeBasePtr & child) const - { - size_t childCols = child->GetNumCols(); // this is what the child says - if (!m_pMBLayout) // no layout: copy from child - return childCols; - size_t cols = m_pMBLayout->GetNumCols(); // layout: get it from there, but validate against child - if (childCols != cols) - RuntimeError("%ls %ls operation: Mismatch in number of columns", OperationName().c_str(), NodeName().c_str()); - return cols; - } protected: void ValidateUnaryMap(bool isFinalValidationPass); void ValidateUnaryReduce(bool isFinalValidationPass); diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index 66cea78ae..c223cdc43 100644 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -529,7 +529,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { template Matrix::~Matrix(void) { - this->Clear(); + Clear(); } template @@ -652,14 +652,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (GetMatrixType() == MatrixType::DENSE) { - for (size_t i = this->GetNumCols()-1; i >= -numShift; i--) - { - Matrix inp = this->ColumnSlice(i + numShift, 1); - Matrix out = this->ColumnSlice(i, 1) ; - out = inp; - } - for (size_t i = 0; i < min(this->GetNumCols(), -numShift); i++) - this->ColumnSlice(i, 1).SetValue(0); + for (size_t i = GetNumCols() - 1; i >= -numShift; i--) + { + Matrix inp = ColumnSlice(i + numShift, 1); + Matrix out = ColumnSlice(i, 1); + out = inp; + } + for (size_t i = 0; i < min(GetNumCols(), -numShift); i++) + ColumnSlice(i, 1).SetValue(0); } else if (GetMatrixType() == MatrixType::SPARSE) { @@ -1029,8 +1029,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, nullptr, - return this->m_CPUMatrix->Get00Element(), - return this->m_GPUMatrix->Get00Element(), + return m_CPUMatrix->Get00Element(), + return m_GPUMatrix->Get00Element(), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1071,7 +1071,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (IsEmpty()) LogicError("Transpose: Matrix is empty."); - Matrix c(this->GetNumCols(), this->GetNumRows(), (DEVICEID_TYPE)this->GetDeviceId()); + Matrix c(GetNumCols(), GetNumRows(), (DEVICEID_TYPE)GetDeviceId()); c.AssignTransposeOf(*this); return c; } @@ -1084,10 +1084,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignTransposeOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignTransposeOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignTransposeOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignTransposeOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignTransposeOf(*a.m_GPUSparseMatrix) + m_GPUSparseMatrix->AssignTransposeOf(*a.m_GPUSparseMatrix) ); return *this; @@ -1149,8 +1149,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->MaskColumnsValue(*columnsMask.m_CPUMatrix, val), - this->m_GPUMatrix->MaskColumnsValue(*columnsMask.m_GPUMatrix, val), + m_CPUMatrix->MaskColumnsValue(*columnsMask.m_CPUMatrix, val), + m_GPUMatrix->MaskColumnsValue(*columnsMask.m_GPUMatrix, val), 
NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1164,8 +1164,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->SetColumn(colPointer,colInd), - this->m_GPUMatrix->SetColumn(colPointer,colInd), + m_CPUMatrix->SetColumn(colPointer,colInd), + m_GPUMatrix->SetColumn(colPointer,colInd), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1189,8 +1189,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->SetColumn(*colMat.m_CPUMatrix,colInd), - this->m_GPUMatrix->SetColumn(*colMat.m_GPUMatrix, colInd), + m_CPUMatrix->SetColumn(*colMat.m_CPUMatrix,colInd), + m_GPUMatrix->SetColumn(*colMat.m_GPUMatrix, colInd), NOT_IMPLEMENTED, NOT_IMPLEMENTED); } @@ -1202,16 +1202,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (this == &deepCopyFrom) return; - this->m_preferredDeviceId = deepCopyFrom.m_preferredDeviceId; + m_preferredDeviceId = deepCopyFrom.m_preferredDeviceId; DecideAndMoveToRightDevice(deepCopyFrom, *this); - this->SwitchToMatrixType(deepCopyFrom.GetMatrixType(), format, false); + SwitchToMatrixType(deepCopyFrom.GetMatrixType(), format, false); DISPATCH_MATRIX_ON_FLAG(&deepCopyFrom, this, - this->m_CPUMatrix->SetValue(*deepCopyFrom.m_CPUMatrix), - this->m_GPUMatrix->SetValue(*deepCopyFrom.m_GPUMatrix), - this->m_CPUSparseMatrix->SetValue(*deepCopyFrom.m_CPUSparseMatrix), - this->m_GPUSparseMatrix->SetValue(*deepCopyFrom.m_GPUSparseMatrix) + m_CPUMatrix->SetValue(*deepCopyFrom.m_CPUMatrix), + m_GPUMatrix->SetValue(*deepCopyFrom.m_GPUMatrix), + m_CPUSparseMatrix->SetValue(*deepCopyFrom.m_CPUSparseMatrix), + m_GPUSparseMatrix->SetValue(*deepCopyFrom.m_GPUSparseMatrix) ); } @@ -1391,8 +1391,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { nullptr, ScaleAndAdd((1-momentum) * learnRatePerSample, gradients, momentum, *this); functionValues -= *this, ScaleAndAdd((1-momentum) * learnRatePerSample, gradients, momentum, *this); functionValues -= *this, - if (momentum != 0) gradients.m_CPUSparseMatrix->NormalGrad(*this->m_CPUMatrix, momentum); ScaleAndAdd(-learnRatePerSample, gradients, functionValues), - if (momentum != 0) gradients.m_GPUSparseMatrix->NormalGrad(*this->m_GPUMatrix, momentum); ScaleAndAdd(-learnRatePerSample, gradients, functionValues) + if (momentum != 0) gradients.m_CPUSparseMatrix->NormalGrad(*m_CPUMatrix, momentum); ScaleAndAdd(-learnRatePerSample, gradients, functionValues), + if (momentum != 0) gradients.m_GPUSparseMatrix->NormalGrad(*m_GPUMatrix, momentum); ScaleAndAdd(-learnRatePerSample, gradients, functionValues) ); } @@ -1402,17 +1402,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DecideAndMoveToRightDevice(*this, gradients); - ElemType aveMultiplier = 1.0f; - DISPATCH_MATRIX_ON_FLAG(&gradients, &gradients, - aveMultiplier = m_CPUMatrix->Adagrad(*gradients.m_CPUMatrix, needAveMultiplier); SetDataLocation(CPU), - aveMultiplier = m_GPUMatrix->Adagrad(*gradients.m_GPUMatrix, needAveMultiplier); SetDataLocation(GPU), - aveMultiplier = gradients.m_CPUSparseMatrix->Adagrad(*this->m_CPUMatrix, needAveMultiplier); SetDataLocation(CPU), - aveMultiplier = gradients.m_GPUSparseMatrix->Adagrad(*this->m_GPUMatrix, needAveMultiplier); SetDataLocation(GPU) + return m_CPUMatrix->Adagrad(*gradients.m_CPUMatrix, needAveMultiplier); SetDataLocation(CPU), + return m_GPUMatrix->Adagrad(*gradients.m_GPUMatrix, needAveMultiplier); SetDataLocation(GPU), + return gradients.m_CPUSparseMatrix->Adagrad(*m_CPUMatrix, needAveMultiplier); 
SetDataLocation(CPU), + return gradients.m_GPUSparseMatrix->Adagrad(*m_GPUMatrix, needAveMultiplier); SetDataLocation(GPU) ); - - return aveMultiplier; } template @@ -1449,17 +1445,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DecideAndMoveToRightDevice(*this, gradients); - ElemType aveMultiplier = 1.0f; - DISPATCH_MATRIX_ON_FLAG(this, &gradients, - aveMultiplier = m_CPUMatrix->RmsProp(*gradients.m_CPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier); SetDataLocation(CPU), - aveMultiplier = m_GPUMatrix->RmsProp(*gradients.m_GPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier); SetDataLocation(GPU), + return m_CPUMatrix->RmsProp(*gradients.m_CPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier); SetDataLocation(CPU), + return m_GPUMatrix->RmsProp(*gradients.m_GPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier); SetDataLocation(GPU), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); - - return aveMultiplier; } template @@ -1610,8 +1602,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->operator+=(*a.m_CPUMatrix), - this->m_GPUMatrix->operator+=(*a.m_GPUMatrix), + m_CPUMatrix->operator+=(*a.m_CPUMatrix), + m_GPUMatrix->operator+=(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED); @@ -1631,7 +1623,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, &c, c += (*this)(0,0), - c += (this->m_GPUMatrix->Get00Element()), // BUGBUG: efficiency + c += (m_GPUMatrix->Get00Element()), // BUGBUG: efficiency c += (*this)(0,0), NOT_IMPLEMENTED ); @@ -1697,8 +1689,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignRowSliceValuesOf(*a.m_CPUMatrix, startIndex, numRows), - this->m_GPUMatrix->AssignRowSliceValuesOf(*a.m_GPUMatrix, startIndex, numRows), + m_CPUMatrix->AssignRowSliceValuesOf(*a.m_CPUMatrix, startIndex, numRows), + m_GPUMatrix->AssignRowSliceValuesOf(*a.m_GPUMatrix, startIndex, numRows), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1717,8 +1709,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignToRowSliceValuesOf(*a.m_CPUMatrix, startIndex, numRows), - this->m_GPUMatrix->AssignToRowSliceValuesOf(*a.m_GPUMatrix, startIndex, numRows), + m_CPUMatrix->AssignToRowSliceValuesOf(*a.m_CPUMatrix, startIndex, numRows), + m_GPUMatrix->AssignToRowSliceValuesOf(*a.m_GPUMatrix, startIndex, numRows), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1738,8 +1730,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AddToRowSliceValuesOf(*a.m_CPUMatrix, startIndex, numRows), - this->m_GPUMatrix->AddToRowSliceValuesOf(*a.m_GPUMatrix, startIndex, numRows), + m_CPUMatrix->AddToRowSliceValuesOf(*a.m_CPUMatrix, startIndex, numRows), + m_GPUMatrix->AddToRowSliceValuesOf(*a.m_GPUMatrix, startIndex, numRows), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1759,8 +1751,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AddWithRowSliceValuesOf(*a.m_CPUMatrix, startIndex, numRows), - this->m_GPUMatrix->AddWithRowSliceValuesOf(*a.m_GPUMatrix, startIndex, numRows), + m_CPUMatrix->AddWithRowSliceValuesOf(*a.m_CPUMatrix, startIndex, numRows), + m_GPUMatrix->AddWithRowSliceValuesOf(*a.m_GPUMatrix, startIndex, numRows), 
NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1842,8 +1834,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignRepeatOf(*a.m_CPUMatrix, numRowRepeats, numColRepeats), - this->m_GPUMatrix->AssignRepeatOf(*a.m_GPUMatrix, numRowRepeats, numColRepeats), + m_CPUMatrix->AssignRepeatOf(*a.m_CPUMatrix, numRowRepeats, numColRepeats), + m_GPUMatrix->AssignRepeatOf(*a.m_GPUMatrix, numRowRepeats, numColRepeats), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1862,8 +1854,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AddToRowRepeatValuesOf(*a.m_CPUMatrix, numRepeats), - this->m_GPUMatrix->AddToRowRepeatValuesOf(*a.m_GPUMatrix, numRepeats), + m_CPUMatrix->AddToRowRepeatValuesOf(*a.m_CPUMatrix, numRepeats), + m_GPUMatrix->AddToRowRepeatValuesOf(*a.m_GPUMatrix, numRepeats), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1885,8 +1877,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignPositiveAndShiftedNegSample(*a.m_CPUMatrix, posNumber, negNumber, shiftNumber), - this->m_GPUMatrix->AssignPositiveAndShiftedNegSample(*a.m_GPUMatrix, posNumber, negNumber, shiftNumber), + m_CPUMatrix->AssignPositiveAndShiftedNegSample(*a.m_CPUMatrix, posNumber, negNumber, shiftNumber), + m_GPUMatrix->AssignPositiveAndShiftedNegSample(*a.m_GPUMatrix, posNumber, negNumber, shiftNumber), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1908,8 +1900,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AddFoldedPositiveAndShiftedNegSample(*a.m_CPUMatrix, posNumber, negNumber, shiftNumber), - this->m_GPUMatrix->AddFoldedPositiveAndShiftedNegSample(*a.m_GPUMatrix, posNumber, negNumber, shiftNumber), + m_CPUMatrix->AddFoldedPositiveAndShiftedNegSample(*a.m_CPUMatrix, posNumber, negNumber, shiftNumber), + m_GPUMatrix->AddFoldedPositiveAndShiftedNegSample(*a.m_GPUMatrix, posNumber, negNumber, shiftNumber), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1928,8 +1920,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignDifferenceOf(alpha,*a.m_CPUMatrix), - this->m_GPUMatrix->AssignDifferenceOf(alpha,*a.m_GPUMatrix), + m_CPUMatrix->AssignDifferenceOf(alpha,*a.m_CPUMatrix), + m_GPUMatrix->AssignDifferenceOf(alpha,*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1948,8 +1940,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignDifferenceOf(*a.m_CPUMatrix, alpha), - this->m_GPUMatrix->AssignDifferenceOf(*a.m_GPUMatrix, alpha), + m_CPUMatrix->AssignDifferenceOf(*a.m_CPUMatrix, alpha), + m_GPUMatrix->AssignDifferenceOf(*a.m_GPUMatrix, alpha), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -1969,8 +1961,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - *this->m_CPUMatrix -= *a.m_CPUMatrix, - *this->m_GPUMatrix -= *a.m_GPUMatrix, + *m_CPUMatrix -= *a.m_CPUMatrix, + *m_GPUMatrix -= *a.m_GPUMatrix, NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2019,7 +2011,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { template Matrix Matrix::operator* (ElemType alpha) const { - Matrix c(GetNumRows(), GetNumCols(), (DEVICEID_TYPE)this->m_preferredDeviceId); + Matrix c(GetNumRows(), GetNumCols(), (DEVICEID_TYPE)m_preferredDeviceId); Scale(alpha, *this, c); return c; } @@ -2081,7 +2073,7 @@ namespace Microsoft { namespace MSR { namespace 
CNTK { DISPATCH_MATRIX_ON_FLAG(this, nullptr, c.AssignProductOf((*this)(0,0), a), - c.AssignProductOf(this->m_GPUMatrix->Get00Element(), a), // BUGBUG: efficiency + c.AssignProductOf(m_GPUMatrix->Get00Element(), a), // BUGBUG: efficiency c.AssignProductOf((*this)(0,0), a), NOT_IMPLEMENTED ); @@ -2104,7 +2096,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } else { - Matrix c(this->GetNumRows(), a.GetNumCols(), (DEVICEID_TYPE)GetPreferredDeviceId()); + Matrix c(GetNumRows(), a.GetNumCols(), (DEVICEID_TYPE)GetPreferredDeviceId()); Multiply(*this, a, c); return c; } @@ -2185,15 +2177,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (!(a.GetMatrixType() == b.GetMatrixType())) NOT_IMPLEMENTED; - this->SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); + SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignElementProductOf(*a.m_CPUMatrix,*b.m_CPUMatrix), - this->m_GPUMatrix->AssignElementProductOf(*a.m_GPUMatrix,*b.m_GPUMatrix), + m_CPUMatrix->AssignElementProductOf(*a.m_CPUMatrix,*b.m_CPUMatrix), + m_GPUMatrix->AssignElementProductOf(*a.m_GPUMatrix,*b.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); + return *this; } @@ -2217,8 +2210,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, nullptr, - this->m_CPUMatrix->AddElementProductOf(*a.m_CPUMatrix,*b.m_CPUMatrix), - this->m_GPUMatrix->AddElementProductOf(*a.m_GPUMatrix,*b.m_GPUMatrix), + m_CPUMatrix->AddElementProductOf(*a.m_CPUMatrix,*b.m_CPUMatrix), + m_GPUMatrix->AddElementProductOf(*a.m_GPUMatrix,*b.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2246,8 +2239,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignElementDivisionOf(*a.m_CPUMatrix,*b.m_CPUMatrix), - this->m_GPUMatrix->AssignElementDivisionOf(*a.m_GPUMatrix,*b.m_GPUMatrix), + m_CPUMatrix->AssignElementDivisionOf(*a.m_CPUMatrix,*b.m_CPUMatrix), + m_GPUMatrix->AssignElementDivisionOf(*a.m_GPUMatrix,*b.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2274,8 +2267,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->ColumnElementMultiplyWith(*a.m_CPUMatrix), - this->m_GPUMatrix->ColumnElementMultiplyWith(*a.m_GPUMatrix), + m_CPUMatrix->ColumnElementMultiplyWith(*a.m_CPUMatrix), + m_GPUMatrix->ColumnElementMultiplyWith(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2300,8 +2293,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->RowElementMultiplyWith(*a.m_CPUMatrix), - this->m_GPUMatrix->RowElementMultiplyWith(*a.m_GPUMatrix), + m_CPUMatrix->RowElementMultiplyWith(*a.m_CPUMatrix), + m_GPUMatrix->RowElementMultiplyWith(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2326,8 +2319,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->RowElementDivideBy(*a.m_CPUMatrix), - this->m_GPUMatrix->RowElementDivideBy(*a.m_GPUMatrix), + m_CPUMatrix->RowElementDivideBy(*a.m_CPUMatrix), + m_GPUMatrix->RowElementDivideBy(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2354,8 +2347,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->ColumnElementDivideBy(*a.m_CPUMatrix), - this->m_GPUMatrix->ColumnElementDivideBy(*a.m_GPUMatrix), + m_CPUMatrix->ColumnElementDivideBy(*a.m_CPUMatrix), + 
m_GPUMatrix->ColumnElementDivideBy(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2370,10 +2363,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->ElementInverse(), - this->m_GPUMatrix->ElementInverse(), + m_CPUMatrix->ElementInverse(), + m_GPUMatrix->ElementInverse(), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->ElementInverse() + m_GPUSparseMatrix->ElementInverse() ); return (*this); @@ -2386,14 +2379,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { LogicError("AssignElementInverseOf: Matrix a is empty."); DecideAndMoveToRightDevice(a, *this); - this->SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); + SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignElementInverseOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignElementInverseOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignElementInverseOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignElementInverseOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignElementInverseOf(*a.m_GPUSparseMatrix) + m_GPUSparseMatrix->AssignElementInverseOf(*a.m_GPUSparseMatrix) ); return *this; @@ -2404,10 +2397,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceSigmoid(), - this->m_GPUMatrix->InplaceSigmoid(), + m_CPUMatrix->InplaceSigmoid(), + m_GPUMatrix->InplaceSigmoid(), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->InplaceSigmoid() + m_GPUSparseMatrix->InplaceSigmoid() ); return (*this); @@ -2421,10 +2414,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignSigmoidOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignSigmoidOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignSigmoidOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignSigmoidOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignSigmoidOf(*a.m_GPUSparseMatrix) + m_GPUSparseMatrix->AssignSigmoidOf(*a.m_GPUSparseMatrix) ); return *this; @@ -2436,10 +2429,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceLinearRectifierDerivative(), - this->m_GPUMatrix->InplaceLinearRectifierDerivative(), + m_CPUMatrix->InplaceLinearRectifierDerivative(), + m_GPUMatrix->InplaceLinearRectifierDerivative(), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->InplaceLinearRectifierDerivative() + m_GPUSparseMatrix->InplaceLinearRectifierDerivative() ); return (*this); @@ -2453,10 +2446,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignLinearRectifierDerivativeOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignLinearRectifierDerivativeOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignLinearRectifierDerivativeOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignLinearRectifierDerivativeOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignLinearRectifierDerivativeOf(*a.m_GPUSparseMatrix) + m_GPUSparseMatrix->AssignLinearRectifierDerivativeOf(*a.m_GPUSparseMatrix) ); return *this; @@ -2468,8 +2461,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceSigmoidDerivative(), - this->m_GPUMatrix->InplaceSigmoidDerivative(), + m_CPUMatrix->InplaceSigmoidDerivative(), + m_GPUMatrix->InplaceSigmoidDerivative(), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2485,8 +2478,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { 
DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignSigmoidDerivativeOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignSigmoidDerivativeOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignSigmoidDerivativeOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignSigmoidDerivativeOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2506,8 +2499,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignNumOfDiff(*a.m_CPUMatrix, *b.m_CPUMatrix, searchInCol), - this->m_GPUMatrix->AssignNumOfDiff(*a.m_GPUMatrix, *b.m_GPUMatrix, searchInCol), + m_CPUMatrix->AssignNumOfDiff(*a.m_CPUMatrix, *b.m_CPUMatrix, searchInCol), + m_GPUMatrix->AssignNumOfDiff(*a.m_GPUMatrix, *b.m_GPUMatrix, searchInCol), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2520,10 +2513,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceTanh(), - this->m_GPUMatrix->InplaceTanh(), + m_CPUMatrix->InplaceTanh(), + m_GPUMatrix->InplaceTanh(), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->InplaceTanh() + m_GPUSparseMatrix->InplaceTanh() ); return (*this); @@ -2537,10 +2530,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignTanhOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignTanhOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignTanhOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignTanhOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignTanhOf(*a.m_GPUSparseMatrix) + m_GPUSparseMatrix->AssignTanhOf(*a.m_GPUSparseMatrix) ); return *this; @@ -2552,8 +2545,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceLogSoftmax(isColWise), - this->m_GPUMatrix->InplaceLogSoftmax(isColWise), + m_CPUMatrix->InplaceLogSoftmax(isColWise), + m_GPUMatrix->InplaceLogSoftmax(isColWise), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2571,8 +2564,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignLogSoftmaxOf(*a.m_CPUMatrix,isColWise), - this->m_GPUMatrix->AssignLogSoftmaxOf(*a.m_GPUMatrix,isColWise), + m_CPUMatrix->AssignLogSoftmaxOf(*a.m_CPUMatrix,isColWise), + m_GPUMatrix->AssignLogSoftmaxOf(*a.m_GPUMatrix,isColWise), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2586,8 +2579,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceHardmax(isColWise), - this->m_GPUMatrix->InplaceHardmax(isColWise), + m_CPUMatrix->InplaceHardmax(isColWise), + m_GPUMatrix->InplaceHardmax(isColWise), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2605,8 +2598,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignHardmaxOf(*a.m_CPUMatrix, isColWise), - this->m_GPUMatrix->AssignHardmaxOf(*a.m_GPUMatrix, isColWise), + m_CPUMatrix->AssignHardmaxOf(*a.m_CPUMatrix, isColWise), + m_GPUMatrix->AssignHardmaxOf(*a.m_GPUMatrix, isColWise), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2619,10 +2612,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceSqrt(), - this->m_GPUMatrix->InplaceSqrt(), + m_CPUMatrix->InplaceSqrt(), + m_GPUMatrix->InplaceSqrt(), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->InplaceSqrt() + m_GPUSparseMatrix->InplaceSqrt() ); return *this; @@ -2639,10 +2632,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - 
this->m_CPUMatrix->AssignSqrtOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignSqrtOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignSqrtOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignSqrtOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignSqrtOf(*a.m_GPUSparseMatrix) + m_GPUSparseMatrix->AssignSqrtOf(*a.m_GPUSparseMatrix) ); return *this; @@ -2654,10 +2647,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceExp(), - this->m_GPUMatrix->InplaceExp(), + m_CPUMatrix->InplaceExp(), + m_GPUMatrix->InplaceExp(), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->InplaceExp() + m_GPUSparseMatrix->InplaceExp() ); return *this; @@ -2675,10 +2668,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignExpOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignExpOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignExpOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignExpOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignExpOf(*a.m_GPUSparseMatrix) + m_GPUSparseMatrix->AssignExpOf(*a.m_GPUSparseMatrix) ); return *this; @@ -2690,10 +2683,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, nullptr, - this->m_CPUMatrix->InplaceAbs(), - this->m_GPUMatrix->InplaceAbs(), + m_CPUMatrix->InplaceAbs(), + m_GPUMatrix->InplaceAbs(), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->InplaceAbs() + m_GPUSparseMatrix->InplaceAbs() ); return *this; @@ -2710,10 +2703,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignAbsOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignAbsOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignAbsOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignAbsOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignAbsOf(*a.m_GPUSparseMatrix) + m_GPUSparseMatrix->AssignAbsOf(*a.m_GPUSparseMatrix) ); return *this; @@ -2725,10 +2718,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceLog(), - this->m_GPUMatrix->InplaceLog(), + m_CPUMatrix->InplaceLog(), + m_GPUMatrix->InplaceLog(), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->InplaceLog() + m_GPUSparseMatrix->InplaceLog() ); return *this; @@ -2740,7 +2733,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceLog10(), + m_CPUMatrix->InplaceLog10(), NOT_IMPLEMENTED, NOT_IMPLEMENTED, NOT_IMPLEMENTED @@ -2760,10 +2753,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignLogOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignLogOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignLogOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignLogOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignLogOf(*a.m_GPUSparseMatrix) + m_GPUSparseMatrix->AssignLogOf(*a.m_GPUSparseMatrix) ); return *this; @@ -2780,10 +2773,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignLog10Of(*a.m_CPUMatrix), + m_CPUMatrix->AssignLog10Of(*a.m_CPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignLogOf(*a.m_GPUSparseMatrix) + m_GPUSparseMatrix->AssignLogOf(*a.m_GPUSparseMatrix) ); return *this; @@ -2795,8 +2788,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceCosine(), - 
this->m_GPUMatrix->InplaceCosine(), + m_CPUMatrix->InplaceCosine(), + m_GPUMatrix->InplaceCosine(), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2815,8 +2808,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignCosineOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignCosineOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignCosineOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignCosineOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2830,8 +2823,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceNegativeSine(), - this->m_GPUMatrix->InplaceNegativeSine(), + m_CPUMatrix->InplaceNegativeSine(), + m_GPUMatrix->InplaceNegativeSine(), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2850,8 +2843,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignNegativeSineOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignNegativeSineOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignNegativeSineOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignNegativeSineOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -2878,10 +2871,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceTruncate(threshold), - this->m_GPUMatrix->InplaceTruncate(threshold), - this->m_CPUSparseMatrix->InplaceTruncate(threshold), - this->m_GPUSparseMatrix->InplaceTruncate(threshold) + m_CPUMatrix->InplaceTruncate(threshold), + m_GPUMatrix->InplaceTruncate(threshold), + m_CPUSparseMatrix->InplaceTruncate(threshold), + m_GPUSparseMatrix->InplaceTruncate(threshold) ); return *this; @@ -2898,7 +2891,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { NOT_IMPLEMENTED, NOT_IMPLEMENTED, NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->InplaceTranspose() + m_GPUSparseMatrix->InplaceTranspose() ); } @@ -2915,10 +2908,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceSoftThreshold(threshold), - this->m_GPUMatrix->InplaceSoftThreshold(threshold), - this->m_CPUSparseMatrix->InplaceSoftThreshold(threshold), - this->m_GPUSparseMatrix->InplaceSoftThreshold(threshold) + m_CPUMatrix->InplaceSoftThreshold(threshold), + m_GPUMatrix->InplaceSoftThreshold(threshold), + m_CPUSparseMatrix->InplaceSoftThreshold(threshold), + m_GPUSparseMatrix->InplaceSoftThreshold(threshold) ); return *this; @@ -2943,10 +2936,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceTruncateBottom(threshold), - this->m_GPUMatrix->InplaceTruncateBottom(threshold), - this->m_CPUSparseMatrix->InplaceTruncateBottom(threshold), - this->m_GPUSparseMatrix->InplaceTruncateBottom(threshold) + m_CPUMatrix->InplaceTruncateBottom(threshold), + m_GPUMatrix->InplaceTruncateBottom(threshold), + m_CPUSparseMatrix->InplaceTruncateBottom(threshold), + m_GPUSparseMatrix->InplaceTruncateBottom(threshold) ); return *this; @@ -2974,17 +2967,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { (*this) = a; return *this; } - } + } DecideAndMoveToRightDevice(a, *this); SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignTruncateBottomOf(*a.m_CPUMatrix, threshold), - this->m_GPUMatrix->AssignTruncateBottomOf(*a.m_GPUMatrix, threshold), + m_CPUMatrix->AssignTruncateBottomOf(*a.m_CPUMatrix, threshold), + m_GPUMatrix->AssignTruncateBottomOf(*a.m_GPUMatrix, 
threshold), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignTruncateBottomOf(*a.m_GPUSparseMatrix, threshold) + m_GPUSparseMatrix->AssignTruncateBottomOf(*a.m_GPUSparseMatrix, threshold) ); return *this; @@ -3010,10 +3003,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->InplaceTruncateTop(threshold), - this->m_GPUMatrix->InplaceTruncateTop(threshold), - this->m_CPUSparseMatrix->InplaceTruncateTop(threshold), - this->m_GPUSparseMatrix->InplaceTruncateTop(threshold) + m_CPUMatrix->InplaceTruncateTop(threshold), + m_GPUMatrix->InplaceTruncateTop(threshold), + m_CPUSparseMatrix->InplaceTruncateTop(threshold), + m_GPUSparseMatrix->InplaceTruncateTop(threshold) ); return *this; @@ -3032,7 +3025,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { (*this) = a; return *this; } - } + } else { if (!isfinite(threshold)) @@ -3040,17 +3033,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { (*this) = a; return *this; } - } + } DecideAndMoveToRightDevice(a, *this); SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignTruncateTopOf(*a.m_CPUMatrix, threshold), - this->m_GPUMatrix->AssignTruncateTopOf(*a.m_GPUMatrix, threshold), + m_CPUMatrix->AssignTruncateTopOf(*a.m_CPUMatrix, threshold), + m_GPUMatrix->AssignTruncateTopOf(*a.m_GPUMatrix, threshold), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->AssignTruncateTopOf(*a.m_GPUSparseMatrix, threshold) + m_GPUSparseMatrix->AssignTruncateTopOf(*a.m_GPUSparseMatrix, threshold) ); return *this; @@ -3065,10 +3058,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->SetToZeroIfAbsLessThan(threshold), - this->m_GPUMatrix->SetToZeroIfAbsLessThan(threshold), + m_CPUMatrix->SetToZeroIfAbsLessThan(threshold), + m_GPUMatrix->SetToZeroIfAbsLessThan(threshold), NOT_IMPLEMENTED, - this->m_GPUSparseMatrix->SetToZeroIfAbsLessThan(threshold) + m_GPUSparseMatrix->SetToZeroIfAbsLessThan(threshold) ); return *this; @@ -3083,13 +3076,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, nullptr, - return this->m_CPUMatrix->SumOfElements(), - return this->m_GPUMatrix->SumOfElements(), - return this->m_CPUSparseMatrix->SumOfElements(), - return this->m_GPUSparseMatrix->SumOfElements() + return m_CPUMatrix->SumOfElements(), + return m_GPUMatrix->SumOfElements(), + return m_CPUSparseMatrix->SumOfElements(), + return m_GPUSparseMatrix->SumOfElements() ); - - } + } template Matrix& Matrix::AssignSumOfElements(const Matrix& a) @@ -3097,7 +3089,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (a.IsEmpty()) LogicError("AssignSumOfElements: Matrix a is empty."); - //WARNING: a and this must have same type + // WARNING: a and this must have same type if (!(GetMatrixType() == a.GetMatrixType())) NOT_IMPLEMENTED; @@ -3105,8 +3097,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignSumOfElements(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignSumOfElements(*a.m_GPUMatrix), + m_CPUMatrix->AssignSumOfElements(*a.m_CPUMatrix), + m_GPUMatrix->AssignSumOfElements(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -3117,18 +3109,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { template DeviceBoundNumber Matrix::Sum_AsDeviceBoundNum() const { - DeviceBoundNumber result; + DeviceBoundNumber result; DISPATCH_MATRIX_ON_FLAG(this, nullptr, - ElemType* val = 
new ElemType; *val = this->m_CPUMatrix->SumOfElements(); result.ShallowCopyFrom(val,-1); return result, + ElemType* val = new ElemType; *val = m_CPUMatrix->SumOfElements(); result.ShallowCopyFrom(val,-1); return result, return m_GPUMatrix->Sum_AsDeviceBoundNum(), NOT_IMPLEMENTED, NOT_IMPLEMENTED - ); - - return result; - } + ); + } //sum of all elements template @@ -3139,12 +3129,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, nullptr, - return this->m_CPUMatrix->SumOfAbsElements(), - return this->m_GPUMatrix->SumOfAbsElements(), + return m_CPUMatrix->SumOfAbsElements(), + return m_GPUMatrix->SumOfAbsElements(), NOT_IMPLEMENTED, - return this->m_GPUSparseMatrix->SumOfAbsElements() + return m_GPUSparseMatrix->SumOfAbsElements() ); - } + } //sum of all elements template @@ -3155,8 +3145,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, nullptr, - return this->m_CPUMatrix->LogAddSumOfElements(), - return this->m_GPUMatrix->LogAddSumOfElements(), + return m_CPUMatrix->LogAddSumOfElements(), + return m_GPUMatrix->LogAddSumOfElements(), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -3195,12 +3185,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, &c, - this->m_CPUMatrix->VectorNorm1(*c.m_CPUMatrix,isColWise), - this->m_GPUMatrix->VectorNorm1(*c.m_GPUMatrix,isColWise), + m_CPUMatrix->VectorNorm1(*c.m_CPUMatrix,isColWise), + m_GPUMatrix->VectorNorm1(*c.m_GPUMatrix,isColWise), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); - } + } template Matrix& Matrix::AssignVectorNorm1Of(Matrix& a, const bool isColWise) @@ -3220,12 +3210,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, &c, - this->m_CPUMatrix->VectorNorm2(*c.m_CPUMatrix,isColWise), - this->m_GPUMatrix->VectorNorm2(*c.m_GPUMatrix,isColWise), + m_CPUMatrix->VectorNorm2(*c.m_CPUMatrix,isColWise), + m_GPUMatrix->VectorNorm2(*c.m_GPUMatrix,isColWise), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); - } + } template Matrix& Matrix::AssignVectorNorm2Of(Matrix& a, const bool isColWise) @@ -3245,12 +3235,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, &c, - this->m_CPUMatrix->VectorNormInf(*c.m_CPUMatrix,isColWise), - this->m_GPUMatrix->VectorNormInf(*c.m_GPUMatrix,isColWise), + m_CPUMatrix->VectorNormInf(*c.m_CPUMatrix,isColWise), + m_GPUMatrix->VectorNormInf(*c.m_GPUMatrix,isColWise), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); - } + } template Matrix& Matrix::AssignVectorNormInfOf(Matrix& a, const bool isColWise) @@ -3286,8 +3276,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignKhatriRaoProductOf(*a.m_CPUMatrix,*b.m_CPUMatrix), - this->m_GPUMatrix->AssignKhatriRaoProductOf(*a.m_GPUMatrix,*b.m_GPUMatrix), + m_CPUMatrix->AssignKhatriRaoProductOf(*a.m_CPUMatrix,*b.m_CPUMatrix), + m_GPUMatrix->AssignKhatriRaoProductOf(*a.m_GPUMatrix,*b.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -3318,8 +3308,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AddColumnReshapeProductOf(*a.m_CPUMatrix,*b.m_CPUMatrix, transposeAColumn), - this->m_GPUMatrix->AddColumnReshapeProductOf(*a.m_GPUMatrix,*b.m_GPUMatrix, transposeAColumn), + m_CPUMatrix->AddColumnReshapeProductOf(*a.m_CPUMatrix,*b.m_CPUMatrix, transposeAColumn), + m_GPUMatrix->AddColumnReshapeProductOf(*a.m_GPUMatrix,*b.m_GPUMatrix, transposeAColumn), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -3342,12 
+3332,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, nullptr, - return this->m_CPUMatrix->FrobeniusNorm(), - return this->m_GPUMatrix->FrobeniusNorm(), - return this->m_CPUSparseMatrix->FrobeniusNorm(), - return this->m_GPUSparseMatrix->FrobeniusNorm() + return m_CPUMatrix->FrobeniusNorm(), + return m_GPUMatrix->FrobeniusNorm(), + return m_CPUSparseMatrix->FrobeniusNorm(), + return m_GPUSparseMatrix->FrobeniusNorm() ); - } + } template Matrix& Matrix::AssignFrobeniusNormOf(const Matrix& a) @@ -3355,7 +3345,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (a.IsEmpty()) LogicError("AssignFrobeniusNormOf: Matrix a is empty."); - this->Resize(1,1); + Resize(1,1); //WARNING: a and this must have same type if (! (GetMatrixType() == a.GetMatrixType())) @@ -3365,8 +3355,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignFrobeniusNormOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignFrobeniusNormOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignFrobeniusNormOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignFrobeniusNormOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -3382,12 +3372,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, nullptr, - return this->m_CPUMatrix->MatrixNormInf(), - return this->m_GPUMatrix->MatrixNormInf(), + return m_CPUMatrix->MatrixNormInf(), + return m_GPUMatrix->MatrixNormInf(), NOT_IMPLEMENTED, - return this->m_GPUSparseMatrix->MatrixNormInf() + return m_GPUSparseMatrix->MatrixNormInf() ); - } + } template ElemType Matrix::MatrixNorm1() const @@ -3397,13 +3387,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, nullptr, - return this->m_CPUMatrix->MatrixNorm1(), - return this->m_GPUMatrix->MatrixNorm1(), + return m_CPUMatrix->MatrixNorm1(), + return m_GPUMatrix->MatrixNorm1(), NOT_IMPLEMENTED, - return this->m_GPUSparseMatrix->MatrixNorm1() + return m_GPUSparseMatrix->MatrixNorm1() ); - - } + } template ElemType Matrix::MatrixNorm0() const @@ -3413,12 +3402,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, nullptr, - return this->m_CPUMatrix->MatrixNorm0(), - return this->m_GPUMatrix->MatrixNorm0(), + return m_CPUMatrix->MatrixNorm0(), + return m_GPUMatrix->MatrixNorm0(), NOT_IMPLEMENTED, - return this->m_GPUSparseMatrix->MatrixNorm0() + return m_GPUSparseMatrix->MatrixNorm0() ); - } + } template Matrix& Matrix::AssignSignOf(const Matrix& a) @@ -3435,8 +3424,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AssignSignOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AssignSignOf(*a.m_GPUMatrix), + m_CPUMatrix->AssignSignOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignSignOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -3456,8 +3445,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->AddSignOf(*a.m_CPUMatrix), - this->m_GPUMatrix->AddSignOf(*a.m_GPUMatrix), + m_CPUMatrix->AddSignOf(*a.m_CPUMatrix), + m_GPUMatrix->AddSignOf(*a.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -3478,12 +3467,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, &maxValues, - this->m_CPUMatrix->VectorMax(*maxIndexes.m_CPUMatrix, *maxValues.m_CPUMatrix, isColWise); maxIndexes.SetDataLocation(CPU, DENSE), - this->m_GPUMatrix->VectorMax(*maxIndexes.m_GPUMatrix, *maxValues.m_GPUMatrix, isColWise); 
maxIndexes.SetDataLocation(GPU, DENSE), + m_CPUMatrix->VectorMax(*maxIndexes.m_CPUMatrix, *maxValues.m_CPUMatrix, isColWise); maxIndexes.SetDataLocation(CPU, DENSE), + m_GPUMatrix->VectorMax(*maxIndexes.m_GPUMatrix, *maxValues.m_GPUMatrix, isColWise); maxIndexes.SetDataLocation(GPU, DENSE), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); - } template @@ -3498,8 +3486,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, &maxValues, - this->m_CPUMatrix->VectorMax(*maxIndexes.m_CPUMatrix, *maxValues.m_CPUMatrix, isColWise, topK); maxIndexes.SetDataLocation(CPU, DENSE), - this->m_GPUMatrix->VectorMax(*maxIndexes.m_GPUMatrix, *maxValues.m_GPUMatrix, isColWise, topK); maxIndexes.SetDataLocation(GPU, DENSE), + m_CPUMatrix->VectorMax(*maxIndexes.m_CPUMatrix, *maxValues.m_CPUMatrix, isColWise, topK); maxIndexes.SetDataLocation(CPU, DENSE), + m_GPUMatrix->VectorMax(*maxIndexes.m_GPUMatrix, *maxValues.m_GPUMatrix, isColWise, topK); maxIndexes.SetDataLocation(GPU, DENSE), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -3517,12 +3505,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, &minValues, - this->m_CPUMatrix->VectorMin(*minIndexes.m_CPUMatrix,*minValues.m_CPUMatrix,isColWise); minIndexes.SetDataLocation(CPU, DENSE), - this->m_GPUMatrix->VectorMin(*minIndexes.m_GPUMatrix,*minValues.m_GPUMatrix,isColWise); minIndexes.SetDataLocation(GPU, DENSE), + m_CPUMatrix->VectorMin(*minIndexes.m_CPUMatrix,*minValues.m_CPUMatrix,isColWise); minIndexes.SetDataLocation(CPU, DENSE), + m_GPUMatrix->VectorMin(*minIndexes.m_GPUMatrix,*minValues.m_GPUMatrix,isColWise); minIndexes.SetDataLocation(GPU, DENSE), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); - } #pragma endregion Member BLAS Functions @@ -3532,7 +3519,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { template wchar_t* Matrix::GetMatrixName() const { - return this->m_baseMatrix->GetMatrixName(); + return m_baseMatrix->GetMatrixName(); } template @@ -3542,23 +3529,23 @@ namespace Microsoft { namespace MSR { namespace CNTK { { if (GetMatrixType() == MatrixType::DENSE) { - this->m_CPUMatrix->SetMatrixName(s); - this->m_GPUMatrix->SetMatrixName(s); + m_CPUMatrix->SetMatrixName(s); + m_GPUMatrix->SetMatrixName(s); } else if (GetMatrixType() == MatrixType::SPARSE) { - this->m_CPUSparseMatrix->SetMatrixName(s); - this->m_GPUSparseMatrix->SetMatrixName(s); + m_CPUSparseMatrix->SetMatrixName(s); + m_GPUSparseMatrix->SetMatrixName(s); } } else { DISPATCH_MATRIX_ON_FLAG(this, nullptr, - this->m_CPUMatrix->SetMatrixName(s), - this->m_GPUMatrix->SetMatrixName(s), - this->m_CPUSparseMatrix->SetMatrixName(s), - this->m_GPUSparseMatrix->SetMatrixName(s) + m_CPUMatrix->SetMatrixName(s), + m_GPUMatrix->SetMatrixName(s), + m_CPUSparseMatrix->SetMatrixName(s), + m_GPUSparseMatrix->SetMatrixName(s) ); } } @@ -3578,8 +3565,37 @@ namespace Microsoft { namespace MSR { namespace CNTK { ); } - //if different and prefered devices are the same, move to preferred device. - //other wise GPU>CPU and if both are GPU move to a's preferred device + // bring two matrices onto the same device + // If different and preferred devices are the same, move to preferred device. + // Otherwise GPU takes precedence over CPU, and if both are GPU move to a's device. + // The inputs are only distinguished in that a's GPU takes precedence over b's in case they differ. + // TODO: This is called somewhat inconsistently, sometimes with a=*this, sometimes with b=*this. 
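To make the precedence rule above concrete, here is the same decision restated as a pure function (a minimal sketch for exposition only; the name DecideTargetDevice is invented here and not part of this code, and CPUDEVICE denotes the CPU's device id as elsewhere in this file):

    static int DecideTargetDevice(int devA, int prefA, int devB, int prefB)
    {
        if (devA == devB)      return devA;   // already on the same device: nothing to do
        if (prefA == prefB)    return prefA;  // both prefer the same device: move both there
        if (devA != CPUDEVICE) return devA;   // a already lives on a GPU: a wins
        return devB;                          // otherwise b's device wins (GPU > CPU)
    }
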
+ template + void Matrix::DecideAndMoveToRightDevice(const Matrix &a, const Matrix &b) + { + int deviceIdA = a.GetDeviceId(), deviceIdB = b.GetDeviceId(); + if (deviceIdA == deviceIdB) + return; + + int preferredDeviceIdA = a.GetPreferredDeviceId(), preferredDeviceIdB = b.GetPreferredDeviceId(); + + if (preferredDeviceIdA == preferredDeviceIdB) // both prefer the same device: move to preferred + { + a._transferToDevice(preferredDeviceIdA); + b._transferToDevice(preferredDeviceIdA); + } + else if (deviceIdA != CPUDEVICE) // one of them lives on GPU: use that + { + b._transferToDevice(deviceIdA); + } + else + { + a._transferToDevice(deviceIdB); + } + } + + // same but for 3 matrices + // If b and c are both on the same GPU then a will be forced to go there; otherwise a's GPU takes precedence, then b's. template void Matrix::DecideAndMoveToRightDevice(const Matrix &a, const Matrix &b, const Matrix &c) { @@ -3589,17 +3605,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { int preferredDeviceIdA = a.GetPreferredDeviceId(), preferredDeviceIdB = b.GetPreferredDeviceId(), preferredDeviceIdC = c.GetPreferredDeviceId(); - if (preferredDeviceIdA == preferredDeviceIdB && preferredDeviceIdA == preferredDeviceIdC) //move to preferred + if (preferredDeviceIdA == preferredDeviceIdB && preferredDeviceIdA == preferredDeviceIdC) { a._transferToDevice(preferredDeviceIdA); b._transferToDevice(preferredDeviceIdA); c._transferToDevice(preferredDeviceIdA); } - else if (deviceIdB == deviceIdC && deviceIdB != CPUDEVICE) + else if (deviceIdB == deviceIdC && deviceIdB != CPUDEVICE) // TODO: why not the other two combinations? { - a._transferToDevice(deviceIdB); + a._transferToDevice(deviceIdB); // 'a' is outvoted } - else if (deviceIdA != CPUDEVICE) //use it + else if (deviceIdA != CPUDEVICE) // one of them lives on GPU: use that { b._transferToDevice(deviceIdA); c._transferToDevice(deviceIdA); @@ -3616,30 +3632,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } - //if different and prefered devices are the same, move to preferred device. - //other wise GPU>CPU and if both are GPU move to a's preferred device + // same but for 4 matrices template - void Matrix::DecideAndMoveToRightDevice(const Matrix &a, const Matrix &b) + void Matrix::DecideAndMoveToRightDevice(const Matrix &a, const Matrix &b, const Matrix &c, const Matrix &d) { - int deviceIdA = a.GetDeviceId(), deviceIdB = b.GetDeviceId(); - if (deviceIdA == deviceIdB) - return; - - int preferredDeviceIdA = a.GetPreferredDeviceId(), preferredDeviceIdB = b.GetPreferredDeviceId(); - - if (preferredDeviceIdA == preferredDeviceIdB) //move to preferred - { - a._transferToDevice(preferredDeviceIdA); - b._transferToDevice(preferredDeviceIdA); - } - else if (deviceIdA != CPUDEVICE) //use it - { - b._transferToDevice(deviceIdA); - } - else - { - a._transferToDevice(deviceIdB); - } + // this function is only called for one operator, so for now we keep it simple + DecideAndMoveToRightDevice(a, b, c); + d._transferToDevice(a.GetDeviceId()); // BUGBUG: Is this correct in case a,b,c share the same preferredDevice? 
} template @@ -3649,7 +3648,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (to_id == from_id) //nothing to do return; - if (this->OwnBuffer()) + if (OwnBuffer()) _transferFromDeviceToDevice(from_id, to_id, ismoved, emptyTransfer); else RuntimeError("Cannot move externally owned matrices to the preferred device."); @@ -3844,16 +3843,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { { if (IsEmpty()) LogicError("Print: Matrix is empty."); - DEVICEID_TYPE orgdevice = this->GetDeviceId(); + DEVICEID_TYPE orgdevice = GetDeviceId(); DISPATCH_MATRIX_ON_FLAG(this, nullptr, - this->m_CPUMatrix->Print(matrixName, rowStart, rowEnd, colStart, colEnd), - _transferToDevice(CPUDEVICE, false, false); this->m_CPUMatrix->Print(matrixName, rowStart, rowEnd, colStart, colEnd); _transferToDevice(orgdevice, false, false), - this->m_CPUSparseMatrix->Print(matrixName), - _transferToDevice(CPUDEVICE, false, false); this->m_CPUSparseMatrix->Print(matrixName); _transferToDevice(orgdevice, false, false) + m_CPUMatrix->Print(matrixName, rowStart, rowEnd, colStart, colEnd), + _transferToDevice(CPUDEVICE, false, false); m_CPUMatrix->Print(matrixName, rowStart, rowEnd, colStart, colEnd); _transferToDevice(orgdevice, false, false), + m_CPUSparseMatrix->Print(matrixName), + _transferToDevice(CPUDEVICE, false, false); m_CPUSparseMatrix->Print(matrixName); _transferToDevice(orgdevice, false, false) ); - } template @@ -4009,11 +4007,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { template Matrix& Matrix::AssignSoftmaxSum(const Matrix& a, const Matrix& softmax) { - this->Resize(1, 1); - if (this->GetDeviceId() < 0) - a.m_CPUMatrix->AssignSoftmaxSum(*softmax.m_CPUMatrix, *this->m_CPUMatrix); + Resize(1, 1); + if (GetDeviceId() < 0) + a.m_CPUMatrix->AssignSoftmaxSum(*softmax.m_CPUMatrix, *m_CPUMatrix); else - a.m_GPUMatrix->AssignSoftmaxSum(*softmax.m_GPUMatrix, *this->m_GPUMatrix); + a.m_GPUMatrix->AssignSoftmaxSum(*softmax.m_GPUMatrix, *m_GPUMatrix); return *this; } @@ -4023,11 +4021,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { //if (a.GetMatrixType() != MatrixType::SPARSE) // NOT_IMPLEMENTED; - this->Resize(1, 1); - if (this->GetDeviceId() < 0) - a.m_CPUMatrix->AssignNCEUnnormalizedEval(*b.m_CPUMatrix, *c.m_CPUMatrix, *bias.m_CPUMatrix, *this->m_CPUMatrix); + Resize(1, 1); + if (GetDeviceId() < 0) + a.m_CPUMatrix->AssignNCEUnnormalizedEval(*b.m_CPUMatrix, *c.m_CPUMatrix, *bias.m_CPUMatrix, *m_CPUMatrix); else - a.m_GPUMatrix->AssignNCEUnnormalizedEval(*b.m_GPUMatrix, *c.m_GPUMatrix, *this->m_GPUMatrix); + a.m_GPUMatrix->AssignNCEUnnormalizedEval(*b.m_GPUMatrix, *c.m_GPUMatrix, *m_GPUMatrix); return *this; } @@ -4037,24 +4035,24 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (a.IsEmpty() || b.IsEmpty() || c.IsEmpty()) LogicError("AssignNoiseContrastiveEstimation: one of the input matrices is empty."); - if (a.GetDeviceId() != b.GetDeviceId() || b.GetDeviceId() != c.GetDeviceId() || c.GetDeviceId() != this->GetDeviceId()) + if (a.GetDeviceId() != b.GetDeviceId() || b.GetDeviceId() != c.GetDeviceId() || c.GetDeviceId() != GetDeviceId()) NOT_IMPLEMENTED; - this->Resize(1, 1); + Resize(1, 1); - if (this->GetDeviceId() < 0) + if (GetDeviceId() < 0) { size_t sampleCount = a.m_CPUMatrix->GetNumElements() / a.m_CPUMatrix->GetNumRows(); tmp.Resize(a.GetNumRows() / 2, sampleCount); a.m_CPUMatrix->AssignNoiseContrastiveEstimation(*b.m_CPUMatrix, *c.m_CPUMatrix, - *bias.m_CPUMatrix, *tmp.m_CPUMatrix, *this->m_CPUMatrix); + *bias.m_CPUMatrix, *tmp.m_CPUMatrix, 
*m_CPUMatrix); } else { size_t sampleCount = a.m_GPUMatrix->GetNumElements() / a.m_GPUMatrix->GetNumRows(); tmp.Resize(a.GetNumRows() / 2, sampleCount); a.m_GPUMatrix->AssignNoiseContrastiveEstimation(*b.m_GPUMatrix, *c.m_GPUMatrix, - *bias.m_GPUMatrix, sampleCount, *tmp.m_GPUMatrix, *this->m_GPUMatrix); + *bias.m_GPUMatrix, sampleCount, *tmp.m_GPUMatrix, *m_GPUMatrix); } return *this; } @@ -4065,13 +4063,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (a.IsEmpty() || b.IsEmpty() || c.IsEmpty()) LogicError("AssignNoiseContrastiveEstimation: one of the input matrices is empty."); - if (a.GetDeviceId() != b.GetDeviceId() || b.GetDeviceId() != c.GetDeviceId() || c.GetDeviceId() != this->GetDeviceId()) + if (a.GetDeviceId() != b.GetDeviceId() || b.GetDeviceId() != c.GetDeviceId() || c.GetDeviceId() != GetDeviceId()) NOT_IMPLEMENTED; assert(tmp.GetNumRows() == a.GetNumRows() / 2); - if (this->GetDeviceId() < 0) + if (GetDeviceId() < 0) { - //samples gradient hidden embedding embedding/hidden + // samples gradient hidden embedding embedding/hidden a.m_CPUMatrix->AssignNCEDerivative(*tmp.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix, inputIndex, *m_CPUMatrix); } else @@ -4496,7 +4494,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { NOT_IMPLEMENTED, NOT_IMPLEMENTED ); - } /// c += alpha * (a-b) @@ -4617,7 +4614,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { NOT_IMPLEMENTED, *c.m_GPUSparseMatrix = (*a.m_GPUSparseMatrix)*alpha ); - } /// Matrix-scalar multiply with col-major matrices: a = alpha * a @@ -4636,7 +4632,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { NOT_IMPLEMENTED, GPUSparseMatrix::Scale(alpha,*a.m_GPUSparseMatrix) ); - } /// Matrix scalar matrix multiply with col-major matrices: a = alpha[0,0] * a @@ -4660,7 +4655,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { NOT_IMPLEMENTED, NOT_IMPLEMENTED ); - } template @@ -4683,7 +4677,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { NOT_IMPLEMENTED, NOT_IMPLEMENTED ); - } template @@ -4695,7 +4688,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { DecideAndMoveToRightDevice(a,b); if (a.GetMatrixType() == b.GetMatrixType()) - { + { DISPATCH_MATRIX_ON_FLAG(&a, nullptr, return CPUMatrix::InnerProductOfMatrices(*a.m_CPUMatrix,*b.m_CPUMatrix), @@ -4703,9 +4696,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { NOT_IMPLEMENTED, NOT_IMPLEMENTED ); - } - else - { + } + else + { DISPATCH_MATRIX_ON_FLAG(&a, nullptr, NOT_IMPLEMENTED, @@ -4722,7 +4715,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (a.IsEmpty() || b.IsEmpty()) LogicError("InnerProductOfMatrices: one of the input matrices is empty."); - this->Resize(1,1); + Resize(1,1); DecideAndMoveToRightDevice(a, b, *this); @@ -4732,8 +4725,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(&a, this, - this->m_CPUMatrix->SetValue(CPUMatrix::InnerProductOfMatrices(*a.m_CPUMatrix,*b.m_CPUMatrix)), - this->m_GPUMatrix->AssignInnerProductOfMatrices(*a.m_GPUMatrix,*b.m_GPUMatrix), + m_CPUMatrix->SetValue(CPUMatrix::InnerProductOfMatrices(*a.m_CPUMatrix,*b.m_CPUMatrix)), + m_GPUMatrix->AssignInnerProductOfMatrices(*a.m_GPUMatrix,*b.m_GPUMatrix), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -4955,12 +4948,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (!(a.GetMatrixType() == b.GetMatrixType())) NOT_IMPLEMENTED; - this->SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); + SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); 
DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignElementProductOfWithShiftNeg(*a.m_CPUMatrix, *b.m_CPUMatrix, shift, negnumber), - this->m_GPUMatrix->AssignElementProductOfWithShiftNeg(*a.m_GPUMatrix, *b.m_GPUMatrix, shift, negnumber), + m_CPUMatrix->AssignElementProductOfWithShiftNeg(*a.m_CPUMatrix, *b.m_CPUMatrix, shift, negnumber), + m_GPUMatrix->AssignElementProductOfWithShiftNeg(*a.m_GPUMatrix, *b.m_GPUMatrix, shift, negnumber), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -5012,8 +5005,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->GetARowByIndex(*a.m_CPUMatrix, index), - this->m_GPUMatrix->GetARowByIndex(*a.m_GPUMatrix, index), + m_CPUMatrix->GetARowByIndex(*a.m_CPUMatrix, index), + m_GPUMatrix->GetARowByIndex(*a.m_GPUMatrix, index), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -5060,12 +5053,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (!(a.GetMatrixType() == b.GetMatrixType())) NOT_IMPLEMENTED; - this->SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); + SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignElementProductOfWithShift(*a.m_CPUMatrix, *b.m_CPUMatrix, shift), - this->m_GPUMatrix->AssignElementProductOfWithShift(*a.m_GPUMatrix, *b.m_GPUMatrix, shift), + m_CPUMatrix->AssignElementProductOfWithShift(*a.m_CPUMatrix, *b.m_CPUMatrix, shift), + m_GPUMatrix->AssignElementProductOfWithShift(*a.m_GPUMatrix, *b.m_GPUMatrix, shift), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -5139,12 +5132,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (label.GetNumCols() != gamma.GetNumCols() || label.GetNumRows() != gamma.GetNumRows()) LogicError("DropFrame: label matrix is not in the same size as gamm matrix."); - this->SwitchToMatrixType(label.GetMatrixType(), label.GetFormat(), false); + SwitchToMatrixType(label.GetMatrixType(), label.GetFormat(), false); DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->DropFrame(*label.m_CPUMatrix, *gamma.m_CPUMatrix, threshhold), - this->m_GPUMatrix->DropFrame(*label.m_GPUMatrix, *gamma.m_GPUMatrix, threshhold), + m_CPUMatrix->DropFrame(*label.m_CPUMatrix, *gamma.m_CPUMatrix, threshhold), + m_GPUMatrix->DropFrame(*label.m_GPUMatrix, *gamma.m_GPUMatrix, threshhold), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -5167,13 +5160,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (!(label.GetMatrixType() == gamma.GetMatrixType())) NOT_IMPLEMENTED; - this->SwitchToMatrixType(label.GetMatrixType(), label.GetFormat(), false); - + SwitchToMatrixType(label.GetMatrixType(), label.GetFormat(), false); DISPATCH_MATRIX_ON_FLAG(this, this, - this->m_CPUMatrix->AssignSequenceError(hsmoothingWeight, *label.m_CPUMatrix, *dnnoutput.m_CPUMatrix, *gamma.m_CPUMatrix, alpha), - this->m_GPUMatrix->AssignSequenceError(hsmoothingWeight, *label.m_GPUMatrix, *dnnoutput.m_GPUMatrix, *gamma.m_GPUMatrix, alpha), + m_CPUMatrix->AssignSequenceError(hsmoothingWeight, *label.m_CPUMatrix, *dnnoutput.m_CPUMatrix, *gamma.m_CPUMatrix, alpha), + m_GPUMatrix->AssignSequenceError(hsmoothingWeight, *label.m_GPUMatrix, *dnnoutput.m_GPUMatrix, *gamma.m_GPUMatrix, alpha), NOT_IMPLEMENTED, NOT_IMPLEMENTED ); @@ -5187,6 +5179,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { const vector & regularOpDims, const array, 2> & regularStrides, const vector & reducingOpDims, const array, 2> & reducingStrides) { + DecideAndMoveToRightDevice(*this, a); + DISPATCH_MATRIX_ON_FLAG(this, this, 
m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides), @@ -5202,6 +5196,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { const vector & regularOpDims, const array, 3> & regularStrides, const vector & reducingOpDims, const array, 3> & reducingStrides) { + DecideAndMoveToRightDevice(*this, a, b); + DISPATCH_MATRIX_ON_FLAG(this, this, m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, *b.m_CPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides), @@ -5217,6 +5213,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { const vector & regularOpDims, const array, 4> & regularStrides, const vector & reducingOpDims, const array, 4> & reducingStrides) { + DecideAndMoveToRightDevice(*this, a, b, c); + DISPATCH_MATRIX_ON_FLAG(this, this, m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides), diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index b1a2aa9fa..6f4eaa26b 100644 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -74,6 +74,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void _transferToDevice(int id_to, bool ismoved=true, bool emptyTransfer=false) const; static void DecideAndMoveToRightDevice(const Matrix& a, const Matrix& b); static void DecideAndMoveToRightDevice(const Matrix& a, const Matrix& b, const Matrix& c); + static void DecideAndMoveToRightDevice(const Matrix& a, const Matrix& b, const Matrix& c, const Matrix& d); static void CopyElementsFromDenseToSparse(CPUMatrix& from, CPUSparseMatrix& dest); public: From 9d33fc1efa6de42519469ab207549bf887b35d3f Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 18 Dec 2015 10:55:30 -0800 Subject: [PATCH 16/19] added a specialization of a tensor op for inner dimensions where all strides are 1. 
Seems not quite enough for really efficient unrolling though --- Source/Math/CPUMatrix.cpp | 145 ++++++++++++++++++++++--------------- Source/Math/Matrix.h | 6 +- Source/Math/TensorView.cpp | 9 ++- 3 files changed, 98 insertions(+), 62 deletions(-) diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index ca08faf71..9a9a52940 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -352,7 +352,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { auto& us = *this; -#pragma omp parallel for +#pragma omp parallel for for (long j = 0; j threshold) @@ -4388,7 +4388,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { ElemType v = alpha*a(0,0); long m=(long)c.GetNumRows(), n=(long)c.GetNumCols(); -#pragma omp parallel for +#pragma omp parallel for for (long j=0; j + template struct TensorOpIteration { static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, const std::vector & regularOpDims, const std::array, N> & regularStrides, const std::vector & reducingOpDims, const std::array, N> & reducingStrides) { - // TODO: if leading dim is all-ones, we can hard-code the loop and hope the compiler vectorizes for us // non-scalar case: still nested result loops left array strides; for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled @@ -5576,7 +5575,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t dim = regularOpDims[(size_t)k]; dim--> 0;) { // need to descend into one loop deeper - TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); // advance the pointers for (size_t i = 0; i < N; i++) pointers[i] += strides[i]; @@ -5584,8 +5583,30 @@ namespace Microsoft { namespace MSR { namespace CNTK { } }; - template - struct TensorOpIteration + // Special version: All innermost strides are 1, and there is no further reduction. Compiler can use SSE. + // This is a very common case, e.g. computing the Sigmoid. 
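The payoff of this special case is that the innermost loop collapses into the canonical unit-stride form that auto-vectorizers recognize. A minimal sketch of the loop shape the compiler then sees (illustrative only, not code from this patch):

    // unit-stride elementwise op: contiguous loads and stores, no gather/scatter,
    // so SSE/AVX auto-vectorization can kick in
    void AddVectors(const float* a, const float* b, float* c, size_t K)
    {
        for (size_t k = 0; k < K; k++)
            c[k] = a[k] + b[k];
    }

With arbitrary strides the same loop needs strided loads, which most compilers will not vectorize.
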
+ template + struct TensorOpIteration + { + static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, + const std::vector & regularOpDims, const std::array, N> & regularStrides, + const std::vector & reducingOpDims, const std::array, N> & reducingStrides) + { + size_t K = regularOpDims[0]; +#pragma omp parallel for + for (int k = 0; k < (int)K; k++) + { + // need to descend into one loop deeper + TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + // advance the pointers + for (size_t i = 0; i < N; i++) + pointers[i] += 1; // instead of strides[i]; + } + } + }; + + template + struct TensorOpIteration { static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, const std::vector &, const std::array, N> &, @@ -5604,16 +5625,26 @@ namespace Microsoft { namespace MSR { namespace CNTK { // tensor operation with k+1 dimensions (-1 means scalar) template - static inline void TensorOpWithRegularLoop(ElemType beta, const array & pointers, ElemType alpha, const OPFN & opfn, - const std::vector & regularOpDims, const std::array, N> & regularStrides, - const std::vector & reducingOpDims, const std::array, N> & reducingStrides) + static void TensorOpWithRegularLoop(ElemType beta, const array & pointers, ElemType alpha, const OPFN & opfn, + const std::vector & regularOpDims, const std::array, N> & regularStrides, + const std::vector & reducingOpDims, const std::array, N> & reducingStrides) { size_t dims = reducingOpDims.size(); switch (dims) { - case 2: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - case 1: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - case 0: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 2: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 1: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 0: + { + // if all leading dimensions are 1, we can let the compiler do some unrolling + bool leadingAllOne = true; + for (size_t i = 0; i < N; i++) + leadingAllOne &= k >= 0 && regularStrides[i][0] == 1; + if (leadingAllOne) // special version that uses a hard-coded increment of 1 for all leading dimensions + return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + else + return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + } default: LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (int)dims); } } @@ -5621,10 +5652,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { // tensor operation, generalized in number of arguments, operation already provided as a lambda // This function now expands into different k. 
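The dispatch idiom used here deserves a remark: a run-time dimension count is converted into a compile-time non-type template argument by a switch, so each case instantiates its own fully specialized loop nest. A condensed sketch of the pattern (hypothetical names, for exposition only):

    template <int k>
    static void LoopNest() { /* k is now a compile-time constant the optimizer can unroll against */ }

    static void Dispatch(size_t dims)
    {
        switch (dims)
        {
        case 0: return LoopNest<-1>(); // -1 means scalar, the recursion terminator
        case 1: return LoopNest<0>();
        case 2: return LoopNest<1>();
        default: LogicError("Dispatch: %d dimensions are not supported.", (int)dims);
        }
    }
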
template - static inline void TensorOpWithFn(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, - const std::array & offsets, - const std::vector & regularOpDims, const std::array, N> & regularStrides, - const std::vector & reducingOpDims, const std::array, N> & reducingStrides) + static void TensorOpWithFn(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, + const std::array & offsets, + const std::vector & regularOpDims, const std::array, N> & regularStrides, + const std::vector & reducingOpDims, const std::array, N> & reducingStrides) { for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled pointers[i] += offsets[i]; diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index 6f4eaa26b..837cb6834 100644 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -6,9 +6,8 @@ // TODO: // - remove empty-matrix checks: if an op is well-defined with empty matrices, then do it -// - Resize() must be cheap if it does nothing (I already did that for CPU, still to be done for GPU) -// - an overload for Resize() to match another matrix -// - need a way to grow a minibatch matrix without destroying its content, something like PushColumns() +// - Resize() must be cheap if it does nothing (I already did that for CPU; already done for GPU?) + #pragma once #include "Basics.h" @@ -170,6 +169,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { ElemType RmsProp(Matrix& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier); void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve = 10000, bool growOnly = true); //by default we only reallocate if need to grow + void Resize(const Matrix& other) { Resize(other.GetNumRows(), other.GetNumCols()); } void VerifySize(size_t rows, size_t cols) { m_baseMatrix->VerifySize(rows, cols); diff --git a/Source/Math/TensorView.cpp b/Source/Math/TensorView.cpp index 0676b014d..e5626cd81 100644 --- a/Source/Math/TensorView.cpp +++ b/Source/Math/TensorView.cpp @@ -235,8 +235,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { m2.SetValue(2, 1, { 42, 13 }); - // unary ops - m3.Resize(2, 3); + m3.Resize(m1); + + // regular zip (just add m1 to itself) + TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m1), 1); + m3.Print(); + + // unary op TensorView(m3).DoSqrtOf(0, TensorView(m1), 1); m3.Print(); From c343e98ae9a56850073f00ba64b4f67aac50c8b6 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 18 Dec 2015 13:30:33 -0800 Subject: [PATCH 17/19] further optimized the most frequent tensor loops (1-stride loops for unary and binary ops), but still not seeing 4-way SSE parallelism --- Source/Math/CPUMatrix.cpp | 103 +++++++++++++++++++++++++------------ Source/Math/TensorView.cpp | 16 ++++-- 2 files changed, 80 insertions(+), 39 deletions(-) diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index 9a9a52940..cf670a73c 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -5525,7 +5525,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // perform loop over reduction index m // This function is declared inside a wrapper struct to allow partial specialization (m = -1). 
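The wrapper struct is needed because C++ allows partial specialization of class templates but not of function templates; terminating the recursion at m = -1 therefore requires a struct. In miniature (hypothetical names, exposition only):

    template <class ElemType, int m>
    struct Reduce
    {
        static ElemType Loop(const ElemType* p, const size_t* dims, const ptrdiff_t* strides)
        {
            ElemType sum = 0;
            for (size_t d = 0; d < dims[m]; d++)  // loop over reduction dimension m
                sum += Reduce<ElemType, m - 1>::Loop(p + d * strides[m], dims, strides);
            return sum;
        }
    };
    template <class ElemType>
    struct Reduce<ElemType, -1>  // partial specialization ends the compile-time recursion
    {
        static ElemType Loop(const ElemType* p, const size_t*, const ptrdiff_t*) { return *p; }
    };
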
- template + template struct TensorOpReduction { // reduction case (non-reduction case is specialized) @@ -5539,7 +5539,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t dim = reducingOpDims[(size_t)m]; dim-- > 0;) { // need to descend into one loop deeper - aggregate += TensorOpReduction::Loop(pointers, opfn, reducingOpDims, reducingStrides); + aggregate += TensorOpReduction::Loop(pointers, opfn, reducingOpDims, reducingStrides); // advance the pointers for (size_t i = 0; i < N - 1; i++) pointers[i] += strides[i]; // note: last pointer (result) is unused and untouched here @@ -5550,8 +5550,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // perform loop over reduction index m // This is the specialized version for m = -1, which terminates the recursion. - template - struct TensorOpReduction + template + struct TensorOpReduction { static inline ElemType Loop(array pointers, const OPFN & opfn, const std::vector &, const std::array, N> &) @@ -5561,7 +5561,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { }; // perform loop over regular index k and reducing index m for N operands (counting the output) - template + template struct TensorOpIteration { static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, @@ -5575,7 +5575,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t dim = regularOpDims[(size_t)k]; dim--> 0;) { // need to descend into one loop deeper - TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); // advance the pointers for (size_t i = 0; i < N; i++) pointers[i] += strides[i]; @@ -5583,37 +5583,72 @@ namespace Microsoft { namespace MSR { namespace CNTK { } }; - // Special version: All innermost strides are 1, and there is no further reduction. Compiler can use SSE. - // This is a very common case, e.g. computing the Sigmoid. - template - struct TensorOpIteration + // Special version for innermost loop with strides all being 1 and no further reduction. Compiler can use SSE. + // This is a very common case, e.g. adding vectors or computing the Sigmoid. 
+ template + struct TensorOpIteration { - static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, - const std::vector & regularOpDims, const std::array, N> & regularStrides, - const std::vector & reducingOpDims, const std::array, N> & reducingStrides) + static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, + const std::vector & regularOpDims, const std::array, 3> & regularStrides, + const std::vector & reducingOpDims, const std::array, 3> & reducingStrides) { + ElemType* pa = pointers[0]; + ElemType* pb = pointers[1]; + ElemType* pc = pointers[2]; size_t K = regularOpDims[0]; + // special-case beta and alpha to allow the compiler to short-circuit it + if (beta != 0) #pragma omp parallel for - for (int k = 0; k < (int)K; k++) - { - // need to descend into one loop deeper - TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - // advance the pointers - for (size_t i = 0; i < N; i++) - pointers[i] += 1; // instead of strides[i]; - } + for (int k = 0; k < (int)K; k++) + TensorOpIteration::Loop(beta, array { pa + k, pb + k, pc + k }, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + else if (alpha != 1) +#pragma omp parallel for + for (int k = 0; k < (int)K; k++) + TensorOpIteration::Loop(0, array { pa + k, pb + k, pc + k }, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + else +#pragma omp parallel for + for (int k = 0; k < (int)K; k++) + TensorOpIteration::Loop(0, array { pa + k, pb + k, pc + k }, 1, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + // TODO: somehow this does not use 4-way parallelism with SSE (VS 2013), and the signedness of k (required for omp) causes an extra sign-extend + // TODO: OMP adds LOTS of overhead. Do we need a guard, a min size when to use it? 
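    // A possible answer to the TODO above, sketched as a guard only (the threshold
    // is invented here and would need measurement on the target machine):
    //     if (K >= 16384) { /* the three #pragma omp variants above */ }
    //     else            { /* the same loops without the pragma    */ }
    // i.e. parallelize only above a minimum trip count, so that small tensors
    // skip the OMP fork/join cost entirely.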
+ } + }; + // and unary + template + struct TensorOpIteration + { + static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, + const std::vector & regularOpDims, const std::array, 2> & regularStrides, + const std::vector & reducingOpDims, const std::array, 2> & reducingStrides) + { + ElemType* pa = pointers[0]; + ElemType* pb = pointers[1]; + size_t K = regularOpDims[0]; + // special-case beta and alpha to allow the compiler to short-circuit it + if (beta != 0) +#pragma omp parallel for + for (int k = 0; k < (int)K; k++) + TensorOpIteration::Loop(beta, array { pa + k, pb + k }, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + else if (alpha != 1) +#pragma omp parallel for + for (int k = 0; k < (int)K; k++) + TensorOpIteration::Loop(0, array { pa + k, pb + k }, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + else +#pragma omp parallel for + for (int k = 0; k < (int)K; k++) + TensorOpIteration::Loop(0, array { pa + k, pb + k }, 1, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } }; - template - struct TensorOpIteration + template + struct TensorOpIteration { static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN & opfn, const std::vector &, const std::array, N> &, const std::vector & reducingOpDims, const std::array, N> & reducingStrides) { // we are at element level for the result: perform the op (there may still be reduction) - ElemType val = alpha * TensorOpReduction::Loop(pointers, opfn, reducingOpDims, reducingStrides); + ElemType val = alpha * TensorOpReduction::Loop(pointers, opfn, reducingOpDims, reducingStrides); // combine with previous value in target matrix, then write it out auto * pout = pointers.back(); if (beta != 0) @@ -5624,7 +5659,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { }; // tensor operation with k+1 dimensions (-1 means scalar) - template + template static void TensorOpWithRegularLoop(ElemType beta, const array & pointers, ElemType alpha, const OPFN & opfn, const std::vector & regularOpDims, const std::array, N> & regularStrides, const std::vector & reducingOpDims, const std::array, N> & reducingStrides) @@ -5632,8 +5667,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t dims = reducingOpDims.size(); switch (dims) { - case 2: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - case 1: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 2: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 1: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); case 0: { // if all leading dimensions are 1, we can let the compiler do some unrolling @@ -5641,9 +5676,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t i = 0; i < N; i++) leadingAllOne &= k >= 0 && regularStrides[i][0] == 1; if (leadingAllOne) // special version that uses a hard-coded increment of 1 for all leading dimensions - return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); else - return TensorOpIteration::Loop(beta, 
pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } default: LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (int)dims); } @@ -5662,11 +5697,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t dims = regularOpDims.size(); switch (dims) { - case 4: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - case 3: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - case 2: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - case 1: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - case 0: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 4: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 3: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 2: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 1: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 0: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); default: LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)dims); } } diff --git a/Source/Math/TensorView.cpp b/Source/Math/TensorView.cpp index e5626cd81..7ee05770b 100644 --- a/Source/Math/TensorView.cpp +++ b/Source/Math/TensorView.cpp @@ -230,10 +230,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { Matrix m2(-1); Matrix m3(-1); { - m1.SetValue(2, 3, { 1, 2, 3, - 14, 15, 6 }); - m2.SetValue(2, 1, { 42, - 13 }); + m1.SetValue(5, 3, { 1, 2, 3, + 14, 15, 6, + 4, 5, 16, + 41, 5, 1, + 1.8, 4.5, 7 }); + m2.SetValue(5, 1, { 42, + 13, + 1968, + 3.1415f, + 7 }); m3.Resize(m1); @@ -256,7 +262,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m3.Print(); // reduction over columns - m3.Resize(2, 1); + m3.Resize(5, 1); TensorView(m3).DoSumOf(0, TensorView(m1), TensorView(m2), 1); m3.Print(); From cf7fc9fe29ef4185f67ff6f8cd37b19c4f099d8d Mon Sep 17 00:00:00 2001 From: Alexey Kamenev Date: Fri, 18 Dec 2015 15:17:49 -0800 Subject: [PATCH 18/19] Updated baselines for cuDNN. 
--- .../QuickE2E/baseline.windows.debug.gpu.txt | 2670 +++++------------ .../QuickE2E/baseline.windows.release.gpu.txt | 2554 +++++----------- 2 files changed, 1552 insertions(+), 3672 deletions(-) diff --git a/Tests/EndToEndTests/Image/QuickE2E/baseline.windows.debug.gpu.txt b/Tests/EndToEndTests/Image/QuickE2E/baseline.windows.debug.gpu.txt index dafd81007..a0f43cc3a 100644 --- a/Tests/EndToEndTests/Image/QuickE2E/baseline.windows.debug.gpu.txt +++ b/Tests/EndToEndTests/Image/QuickE2E/baseline.windows.debug.gpu.txt @@ -1,253 +1,245 @@ ------------------------------------------------------------------- Build info: - Built time: Nov 23 2015 10:00:15 - Last modified date: Mon Nov 23 09:45:21 2015 - Built by alexeyk on alexey-rz - Build Path: C:\src\cntk\MachineLearning\CNTK\ + Built time: Dec 18 2015 15:12:36 + Last modified date: Wed Dec 16 11:33:30 2015 CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + Build Branch: + Build SHA1: + Built by alexeyk on z840-01 + Build Path: C:\src\cntk\Source\CNTK\ ------------------------------------------------------------------- -running on alexey-rz at 2015/11/23 10:07:45 +running on z840-01 at 2015/12/18 15:13:39 command line: -C:\src\cntk\x64\Debug\CNTK.exe configFile=C:\src\cntk\Tests\Image\QuickE2E\cntk.config RunDir=C:\src\cntk\Tests\Image\_run DataDir=C:\src\cntk\Tests\Image\Data ConfigDir=C:\src\cntk\Tests\Image\QuickE2E DeviceId=0 +C:\src\cntk\x64\Debug\CNTK.exe configFile=QuickE2E\cntk.config ConfigDir=QuickE2E RunDir=_out DataDir=Data DeviceId=Auto stderr=_out\gpu.txt >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=$DeviceId$ -ndlMacros=$ConfigDir$/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=$RunDir$/models/cntk.dnn - deviceId=$DeviceId$ - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = $DeviceId$ +ndlMacros = "$ConfigDir$/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "$RunDir$/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=$RunDir$/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl +test = [ + action = "test" + modelPath = "$RunDir$/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] 
-RunDir=C:\src\cntk\Tests\Image\_run -DataDir=C:\src\cntk\Tests\Image\Data -ConfigDir=C:\src\cntk\Tests\Image\QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=_out\gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=0 -ndlMacros=C:\src\cntk\Tests\Image\QuickE2E/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = Auto +ndlMacros = "QuickE2E/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl +test = [ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -RunDir=C:\src\cntk\Tests\Image\_run -DataDir=C:\src\cntk\Tests\Image\Data -ConfigDir=C:\src\cntk\Tests\Image\QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=_out\gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -configparameters: cntk.config:command=Train:Test -configparameters: cntk.config:ConfigDir=C:\src\cntk\Tests\Image\QuickE2E -configparameters: cntk.config:DataDir=C:\src\cntk\Tests\Image\Data -configparameters: cntk.config:deviceId=0 -configparameters: cntk.config:ndlMacros=C:\src\cntk\Tests\Image\QuickE2E/Macros.ndl +configparameters: cntk.config:command=train:test +configparameters: cntk.config:ConfigDir=QuickE2E +configparameters: cntk.config:DataDir=Data +configparameters: cntk.config:deviceId=Auto +configparameters: cntk.config:ndlMacros=QuickE2E/Macros.ndl +configparameters: cntk.config:numCPUThreads=8 configparameters: cntk.config:parallelTrain=false configparameters: cntk.config:precision=float -configparameters: 
cntk.config:RunDir=C:\src\cntk\Tests\Image\_run -configparameters: cntk.config:Test=[ - action=test - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl +configparameters: cntk.config:RunDir=_out +configparameters: cntk.config:stderr=_out\gpu.txt +configparameters: cntk.config:test=[ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -configparameters: cntk.config:Train=[ - action=train - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +configparameters: cntk.config:train=[ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< -command: Train Test +command: train test precision = float -CNTKModelPath: C:\src\cntk\Tests\Image\_run/models/cntk.dnn -CNTKCommandTrainInfo: Train : 12 +Using 8 CPU threads +CNTKModelPath: _out/models/cntk.dnn +CNTKCommandTrainInfo: train : 12 CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 12 -CNTKCommandTrainBegin: Train +CNTKCommandTrainBegin: train +LockDevice: Locked GPU 0 to test availability. +LockDevice: Unlocked GPU 0 after testing. +LockDevice: Locked GPU 1 to test availability. +LockDevice: Unlocked GPU 1 after testing. +LockDevice: Locked GPU 2 to test availability. +LockDevice: Unlocked GPU 2 after testing. +LockDevice: Locked GPU 0 for exclusive use. NDLBuilder Using GPU 0 -reading uci file C:\src\cntk\Tests\Image\Data/Train.txt +Reading UCI file Data/Train.txt +Microsoft::MSR::CNTK::GPUMatrix::SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4 + +Post-processing network... + +3 roots: + ce = CrossEntropyWithSoftmax + err = ErrorPrediction + outputNodes.z = Plus +FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation +FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation +FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation -Allocating matrices for forward propagation. - - -Printing Gradient Computation Node Order ... 
- -CE[0, 0] = CrossEntropyWithSoftmax(labels[10, 1], OutputNodes.z[0, 0]) -OutputNodes.z[0, 0] = Plus(OutputNodes.t[0, 0], OutputNodes.b[10, 1]) -OutputNodes.b[10, 1] = LearnableParameter -OutputNodes.t[0, 0] = Times(OutputNodes.W[10, 128], h1.y[0, 0]) -h1.y[0, 0] = Sigmoid(h1.z[0, 0]) -h1.z[0, 0] = Plus(h1.t[0, 0], h1.b[128, 1]) -h1.b[128, 1] = LearnableParameter -h1.t[0, 0] = Times(h1.W[128, 512], pool2[0, 0]) -pool2[0, 0] = AveragePooling(conv2_act.act[0, 0]) -conv2_act.act[0, 0] = RectifiedLinear(conv2_act.convPlusB[0, 0]) -conv2_act.convPlusB[0, 0] = Plus(conv2_act.conv[0, 0], conv2_act.convB[32, 1]) -conv2_act.convB[32, 1] = LearnableParameter -conv2_act.conv[0, 0] = Convolution(conv2_act.convW[32, 400], pool1[0, 0]) -pool1[0, 0] = MaxPooling(conv1_act.act[0, 0]) -conv1_act.act[0, 0] = RectifiedLinear(conv1_act.convPlusB[0, 0]) -conv1_act.convPlusB[0, 0] = Plus(conv1_act.conv[0, 0], conv1_act.convB[16, 1]) -conv1_act.convB[16, 1] = LearnableParameter -conv1_act.conv[0, 0] = Convolution(conv1_act.convW[16, 25], featScaled[0, 0]) -featScaled[0, 0] = Scale(featScale[1, 1], features[784, 1]) -features[784, 1] = InputValue -featScale[1, 1] = LearnableParameter -conv1_act.convW[16, 25] = LearnableParameter -conv2_act.convW[32, 400] = LearnableParameter -h1.W[128, 512] = LearnableParameter -OutputNodes.W[10, 128] = LearnableParameter -labels[10, 1] = InputValue - -Validating for node CE. 26 nodes to process in pass 1. +Validating for node ce. 26 nodes to process in pass 1. Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -256,27 +248,27 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> 
[2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE. 15 nodes to process in pass 2. +Validating for node ce. 15 nodes to process in pass 2. 
Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -285,27 +277,27 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], 
h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE, final verification. +Validating for node ce, final verification. Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -314,31 +306,30 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = 
Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. - -Validating for node CE. 26 nodes to process in pass 1. +Validating for node err. 26 nodes to process in pass 1. Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -347,27 +338,27 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] 
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE. 14 nodes to process in pass 2. +Validating for node err. 14 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -376,27 +367,27 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 
{W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE, final verification. +Validating for node err, final verification. Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -405,30 +396,29 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> 
conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Validating for node outputNodes.z. 24 nodes to process in pass 1. -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -437,25 +427,25 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 
0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node OutputNodes.z. 13 nodes to process in pass 2. +Validating for node outputNodes.z. 13 nodes to process in pass 2. -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -464,25 +454,25 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = 
Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node OutputNodes.z, final verification. +Validating for node outputNodes.z, final verification. -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -491,301 +481,39 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] +Validating --> h1.z = 
Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] 9 out of 24 nodes do not share the minibatch layout with the input data. +Post-processing network complete. - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. - -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] - -9 out of 24 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. 
- -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -Validating for node Err. 14 nodes to process in pass 2. 
- -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -Validating for node Err, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. 
- -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -Validating for node Err. 14 nodes to process in pass 2. 
- -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -Validating for node Err, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - -SetUniformRandomValue (GPU): creating curand object with seed 1 SGD using GPU 0. -GetTrainCriterionNodes ... -GetEvalCriterionNodes ... + +Training criterion node(s): + ce = CrossEntropyWithSoftmax + +Evaluation criterion node(s): + err = ErrorPrediction -Allocating matrices for gradient computing +Allocating matrices for forward and/or backward propagation. No PreCompute nodes found, skipping PreCompute step Set Max Temp Mem Size For Convolution Nodes to 0 samples. -Starting Epoch 1: learning rate per sample = 0.005000 effective momentum = 0.000000 +Starting Epoch 1: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting at epoch 0 counting lines to determine record count 1000 records found @@ -793,148 +521,126 @@ starting epoch 0 at record count 0, and file position 0 already there from last epoch Starting minibatch loop. -randomordering: 11 retries for 100 elements (11.0%) to ensure window condition -randomordering: recached sequence for seed 0: 15, 33, ... 
- Epoch[ 1 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.37891785; EvalErr[0]PerSample = 0.93000000; TotalTime = 0.19572s; TotalTimePerSample = 1.95719ms; SamplesPerSecond = 510 -Finished Epoch[ 1 of 12]: [Training Set] TrainLossPerSample = 2.3789177; EvalErrPerSample = 0.93000001; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.226218 -Starting Epoch 2: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 11 retries for 100 elements (11.0%) to ensure window condition +RandomOrdering: recached sequence for seed 0: 15, 33, ... + Epoch[ 1 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.34708694; EvalErr[0]PerSample = 0.92000000; TotalTime = 0.2483s; SamplesPerSecond = 402.8 +Finished Epoch[ 1 of 12]: [Training Set] TrainLossPerSample = 2.3470869; EvalErrPerSample = 0.91999996; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.285798 +Starting Epoch 2: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 1 at record count 100, and file position 100 already there from last epoch Starting minibatch loop. -randomordering: 26 retries for 100 elements (26.0%) to ensure window condition -randomordering: recached sequence for seed 1: 20, 26, ... - Epoch[ 2 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.37942505; EvalErr[0]PerSample = 0.91000000; TotalTime = 0.08022s; TotalTimePerSample = 0.80224ms; SamplesPerSecond = 1246 -Finished Epoch[ 2 of 12]: [Training Set] TrainLossPerSample = 2.379425; EvalErrPerSample = 0.90999997; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.080724 -Starting Epoch 3: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 26 retries for 100 elements (26.0%) to ensure window condition +RandomOrdering: recached sequence for seed 1: 20, 26, ... + Epoch[ 2 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.29444092; EvalErr[0]PerSample = 0.85000000; TotalTime = 0.0975s; SamplesPerSecond = 1025.7 +Finished Epoch[ 2 of 12]: [Training Set] TrainLossPerSample = 2.294441; EvalErrPerSample = 0.84999996; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.100328 +Starting Epoch 3: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 2 at record count 200, and file position 200 already there from last epoch Starting minibatch loop. -randomordering: 28 retries for 100 elements (28.0%) to ensure window condition -randomordering: recached sequence for seed 2: 4, 35, ... - Epoch[ 3 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.32070969; EvalErr[0]PerSample = 0.85000000; TotalTime = 0.08246s; TotalTimePerSample = 0.82460ms; SamplesPerSecond = 1212 -Finished Epoch[ 3 of 12]: [Training Set] TrainLossPerSample = 2.3207097; EvalErrPerSample = 0.84999996; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.083 -Starting Epoch 4: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 28 retries for 100 elements (28.0%) to ensure window condition +RandomOrdering: recached sequence for seed 2: 4, 35, ... 
+ Epoch[ 3 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.13786560; EvalErr[0]PerSample = 0.73000000; TotalTime = 0.0973s; SamplesPerSecond = 1027.9 +Finished Epoch[ 3 of 12]: [Training Set] TrainLossPerSample = 2.1378655; EvalErrPerSample = 0.72999996; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.100033 +Starting Epoch 4: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 3 at record count 300, and file position 300 already there from last epoch Starting minibatch loop. -randomordering: 17 retries for 100 elements (17.0%) to ensure window condition -randomordering: recached sequence for seed 3: 28, 7, ... - Epoch[ 4 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.36838959; EvalErr[0]PerSample = 0.90000000; TotalTime = 0.08074s; TotalTimePerSample = 0.80741ms; SamplesPerSecond = 1238 -Finished Epoch[ 4 of 12]: [Training Set] TrainLossPerSample = 2.3683896; EvalErrPerSample = 0.89999998; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.081265 -Starting Epoch 5: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 17 retries for 100 elements (17.0%) to ensure window condition +RandomOrdering: recached sequence for seed 3: 28, 7, ... + Epoch[ 4 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.03929321; EvalErr[0]PerSample = 0.75000000; TotalTime = 0.0854s; SamplesPerSecond = 1171.3 +Finished Epoch[ 4 of 12]: [Training Set] TrainLossPerSample = 2.0392931; EvalErrPerSample = 0.75; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.08801 +Starting Epoch 5: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 4 at record count 400, and file position 400 already there from last epoch Starting minibatch loop. -randomordering: 15 retries for 100 elements (15.0%) to ensure window condition -randomordering: recached sequence for seed 4: 5, 36, ... - Epoch[ 5 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.32354156; EvalErr[0]PerSample = 0.84000000; TotalTime = 0.07892s; TotalTimePerSample = 0.78921ms; SamplesPerSecond = 1267 -Finished Epoch[ 5 of 12]: [Training Set] TrainLossPerSample = 2.3235414; EvalErrPerSample = 0.83999997; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.079374 -Starting Epoch 6: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 15 retries for 100 elements (15.0%) to ensure window condition +RandomOrdering: recached sequence for seed 4: 5, 36, ... + Epoch[ 5 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.77985352; EvalErr[0]PerSample = 0.53000000; TotalTime = 0.0979s; SamplesPerSecond = 1021.3 +Finished Epoch[ 5 of 12]: [Training Set] TrainLossPerSample = 1.7798535; EvalErrPerSample = 0.52999997; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.100739 +Starting Epoch 6: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 5 at record count 500, and file position 500 already there from last epoch Starting minibatch loop. -randomordering: 13 retries for 100 elements (13.0%) to ensure window condition -randomordering: recached sequence for seed 5: 11, 48, ... 
- Epoch[ 6 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.24672409; EvalErr[0]PerSample = 0.83000000; TotalTime = 0.07987s; TotalTimePerSample = 0.79865ms; SamplesPerSecond = 1252 -Finished Epoch[ 6 of 12]: [Training Set] TrainLossPerSample = 2.2467241; EvalErrPerSample = 0.82999998; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.08033 -Starting Epoch 7: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 13 retries for 100 elements (13.0%) to ensure window condition +RandomOrdering: recached sequence for seed 5: 11, 48, ... + Epoch[ 6 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.49362656; EvalErr[0]PerSample = 0.14000000; TotalTime = 0.0967s; SamplesPerSecond = 1033.8 +Finished Epoch[ 6 of 12]: [Training Set] TrainLossPerSample = 1.4936265; EvalErrPerSample = 0.14; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.09948 +Starting Epoch 7: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 6 at record count 600, and file position 600 already there from last epoch Starting minibatch loop. -randomordering: 13 retries for 100 elements (13.0%) to ensure window condition -randomordering: recached sequence for seed 6: 15, 3, ... - Epoch[ 7 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.09912888; EvalErr[0]PerSample = 0.69000000; TotalTime = 0.07999s; TotalTimePerSample = 0.79993ms; SamplesPerSecond = 1250 -Finished Epoch[ 7 of 12]: [Training Set] TrainLossPerSample = 2.0991287; EvalErrPerSample = 0.69; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.080483 -Starting Epoch 8: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 13 retries for 100 elements (13.0%) to ensure window condition +RandomOrdering: recached sequence for seed 6: 15, 3, ... + Epoch[ 7 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.17570114; EvalErr[0]PerSample = 0.14000000; TotalTime = 0.0982s; SamplesPerSecond = 1018.8 +Finished Epoch[ 7 of 12]: [Training Set] TrainLossPerSample = 1.1757011; EvalErrPerSample = 0.14; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.100967 +Starting Epoch 8: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 7 at record count 700, and file position 700 already there from last epoch Starting minibatch loop. -randomordering: 22 retries for 100 elements (22.0%) to ensure window condition -randomordering: recached sequence for seed 7: 9, 19, ... - Epoch[ 8 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.01871979; EvalErr[0]PerSample = 0.61000000; TotalTime = 0.07961s; TotalTimePerSample = 0.79607ms; SamplesPerSecond = 1256 -Finished Epoch[ 8 of 12]: [Training Set] TrainLossPerSample = 2.0187197; EvalErrPerSample = 0.61000001; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.080087 -Starting Epoch 9: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 22 retries for 100 elements (22.0%) to ensure window condition +RandomOrdering: recached sequence for seed 7: 9, 19, ... 
+ Epoch[ 8 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.98662323; EvalErr[0]PerSample = 0.05000000; TotalTime = 0.0825s; SamplesPerSecond = 1212.2 +Finished Epoch[ 8 of 12]: [Training Set] TrainLossPerSample = 0.98662323; EvalErrPerSample = 0.049999997; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.08482 +Starting Epoch 9: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 8 at record count 800, and file position 800 already there from last epoch Starting minibatch loop. -randomordering: 16 retries for 100 elements (16.0%) to ensure window condition -randomordering: recached sequence for seed 8: 8, 5, ... - Epoch[ 9 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.75549896; EvalErr[0]PerSample = 0.35000000; TotalTime = 0.08258s; TotalTimePerSample = 0.82578ms; SamplesPerSecond = 1210 -Finished Epoch[ 9 of 12]: [Training Set] TrainLossPerSample = 1.7554989; EvalErrPerSample = 0.34999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.083038 -Starting Epoch 10: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 16 retries for 100 elements (16.0%) to ensure window condition +RandomOrdering: recached sequence for seed 8: 8, 5, ... + Epoch[ 9 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.72003891; EvalErr[0]PerSample = 0.01000000; TotalTime = 0.0983s; SamplesPerSecond = 1017.6 +Finished Epoch[ 9 of 12]: [Training Set] TrainLossPerSample = 0.72003889; EvalErrPerSample = 0.0099999998; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.101038 +Starting Epoch 10: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 9 at record count 900, and file position 900 already there from last epoch Starting minibatch loop. -randomordering: 16 retries for 100 elements (16.0%) to ensure window condition -randomordering: recached sequence for seed 9: 7, 10, ... - Epoch[10 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.64107086; EvalErr[0]PerSample = 0.39000000; TotalTime = 0.09024s; TotalTimePerSample = 0.90243ms; SamplesPerSecond = 1108 -Finished Epoch[10 of 12]: [Training Set] TrainLossPerSample = 1.6410708; EvalErrPerSample = 0.38999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.090849 -Starting Epoch 11: learning rate per sample = 0.005000 effective momentum = 0.700000 +RandomOrdering: 16 retries for 100 elements (16.0%) to ensure window condition +RandomOrdering: recached sequence for seed 9: 7, 10, ... + Epoch[10 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.60043060; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0839s; SamplesPerSecond = 1191.9 +Finished Epoch[10 of 12]: [Training Set] TrainLossPerSample = 0.60043061; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.086226 +Starting Epoch 11: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples starting epoch 10 at record count 1000, and file position 0 already there from last epoch Starting minibatch loop. -randomordering: 22 retries for 100 elements (22.0%) to ensure window condition -randomordering: recached sequence for seed 10: 13, 22, ... 
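(The "momentum as time constant" figure that the new log lines add is the momentum expressed as an exponential-decay time constant in samples: with minibatchSize = 10 and momentumPerMB = 0*10:0.7 from the config, epochs 1-10 report 0.0 samples and epochs 11-12 report -10/ln(0.7), which is about 28.0 samples. A small sketch of the conversion follows; the helper name is illustrative, not taken from the CNTK sources.

    #include <cmath>
    #include <cstddef>
    #include <cstdio>

    // Illustrative only: converts per-minibatch momentum to the
    // "momentum as time constant" value (in samples) shown in the log.
    // The per-sample momentum is momentumPerMB^(1/mbSize), and the time
    // constant tau solves perSampleMomentum == exp(-1/tau).
    double MomentumAsTimeConstant(double momentumPerMB, size_t mbSize)
    {
        if (momentumPerMB <= 0)
            return 0.0;                                  // epochs 1-10: 0.0 samples
        return -(double)mbSize / log(momentumPerMB);     // epochs 11-12: 28.0 samples
    }

    int main()
    {
        printf("%.1f\n", MomentumAsTimeConstant(0.0, 10)); // prints 0.0
        printf("%.1f\n", MomentumAsTimeConstant(0.7, 10)); // prints 28.0
    }
)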
- Epoch[11 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.30029129; EvalErr[0]PerSample = 0.12000000; TotalTime = 0.08305s; TotalTimePerSample = 0.83050ms; SamplesPerSecond = 1204 -Finished Epoch[11 of 12]: [Training Set] TrainLossPerSample = 1.3002913; EvalErrPerSample = 0.12; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.083644 -Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 +RandomOrdering: 22 retries for 100 elements (22.0%) to ensure window condition +RandomOrdering: recached sequence for seed 10: 13, 22, ... + Epoch[11 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.42560421; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0980s; SamplesPerSecond = 1020.2 +Finished Epoch[11 of 12]: [Training Set] TrainLossPerSample = 0.42560419; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.100689 +Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples starting epoch 11 at record count 1100, and file position 100 already there from last epoch Starting minibatch loop. -randomordering: 21 retries for 100 elements (21.0%) to ensure window condition -randomordering: recached sequence for seed 11: 6, 31, ... - Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.01696381; EvalErr[0]PerSample = 0.05000000; TotalTime = 0.08059s; TotalTimePerSample = 0.80586ms; SamplesPerSecond = 1240 -Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 1.0169638; EvalErrPerSample = 0.049999997; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.081069 -CNTKCommandTrainEnd: Train +RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition +RandomOrdering: recached sequence for seed 11: 6, 31, ... + Epoch[12 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.33292500; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0981s; SamplesPerSecond = 1019.0 +Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.33292499; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.101064 +CNTKCommandTrainEnd: train + +Post-processing network... + +3 roots: + ce = CrossEntropyWithSoftmax + outputNodes.z = Plus + err = ErrorPrediction +FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation +FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation +FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation -Allocating matrices for forward propagation. - - -Printing Gradient Computation Node Order ... 
- -CE[0, 0] = CrossEntropyWithSoftmax(labels[10, 0], OutputNodes.z[0, 0]) -OutputNodes.z[0, 0] = Plus(OutputNodes.t[0, 0], OutputNodes.b[10, 1]) -OutputNodes.b[10, 1] = LearnableParameter -OutputNodes.t[0, 0] = Times(OutputNodes.W[10, 128], h1.y[0, 0]) -h1.y[0, 0] = Sigmoid(h1.z[0, 0]) -h1.z[0, 0] = Plus(h1.t[0, 0], h1.b[128, 1]) -h1.b[128, 1] = LearnableParameter -h1.t[0, 0] = Times(h1.W[128, 512], pool2[0, 0]) -pool2[0, 0] = AveragePooling(conv2_act.act[0, 0]) -conv2_act.act[0, 0] = RectifiedLinear(conv2_act.convPlusB[0, 0]) -conv2_act.convPlusB[0, 0] = Plus(conv2_act.conv[0, 0], conv2_act.convB[32, 1]) -conv2_act.convB[32, 1] = LearnableParameter -conv2_act.conv[0, 0] = Convolution(conv2_act.convW[32, 400], pool1[0, 0]) -pool1[0, 0] = MaxPooling(conv1_act.act[0, 0]) -conv1_act.act[0, 0] = RectifiedLinear(conv1_act.convPlusB[0, 0]) -conv1_act.convPlusB[0, 0] = Plus(conv1_act.conv[0, 0], conv1_act.convB[16, 1]) -conv1_act.convB[16, 1] = LearnableParameter -conv1_act.conv[0, 0] = Convolution(conv1_act.convW[16, 25], featScaled[0, 0]) -featScaled[0, 0] = Scale(featScale[1, 1], features[784, 0]) -features[784, 0] = InputValue -featScale[1, 1] = LearnableParameter -conv1_act.convW[16, 25] = LearnableParameter -conv2_act.convW[32, 400] = LearnableParameter -h1.W[128, 512] = LearnableParameter -OutputNodes.W[10, 128] = LearnableParameter -labels[10, 0] = InputValue - -Validating for node CE. 26 nodes to process in pass 1. +Validating for node ce. 26 nodes to process in pass 1. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -955,15 +661,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE. 15 nodes to process in pass 2. +Validating for node ce. 15 nodes to process in pass 2. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -984,15 +690,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE, final verification. +Validating for node ce, final verification. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1013,19 +719,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Validating for node outputNodes.z. 24 nodes to process in pass 1. -Validating for node CE. 26 nodes to process in pass 1. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1046,15 +750,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE. 14 nodes to process in pass 2. +Validating for node outputNodes.z. 13 nodes to process in pass 2. -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1075,15 +777,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE, final verification. +Validating for node outputNodes.z, final verification. 
-Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1104,189 +804,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. - -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] 9 out of 24 nodes do not share the minibatch layout with the input data. - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -9 out of 24 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node err. 26 nodes to process in pass 1. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1307,15 +835,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node err. 14 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1336,15 +864,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node err, final verification. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1365,364 +893,268 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err. 
14 nodes to process in pass 2. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Post-processing network complete. evalNodeNames are not specified, using all the default evalnodes and training criterion nodes. + + +Allocating matrices for forward and/or backward propagation. starting epoch 0 at record count 0, and file position 0 already there from last epoch -randomordering: 11 retries for 100 elements (11.0%) to ensure window condition -randomordering: recached sequence for seed 0: 15, 33, ... -Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.87062637 Perplexity = 2.3884064 +RandomOrdering: 11 retries for 100 elements (11.0%) to ensure window condition +RandomOrdering: recached sequence for seed 0: 15, 33, ... 
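(The Final Results line that follows reports both criteria per sample. Perplexity is exp of the per-sample cross entropy -- exp(0.29111851) is about 1.3379231, and exp(0.87062637) about 2.3884064 for the pre-fix baseline earlier in this log -- while ErrorPrediction counts samples whose argmax over the 10 output rows disagrees with the label's. The sketch below mirrors those semantics only, assuming column-major minibatch storage; it is not CNTK's actual kernel code.

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdio>

    // Sketch only: per-sample metrics as reported in "Final Results".
    // 'labels' and 'z' are rows x cols matrices in column-major order.
    double ErrorPerSample(const float* labels, const float* z,
                          size_t rows, size_t cols)
    {
        size_t errors = 0;
        for (size_t j = 0; j < cols; j++)
        {
            const float* l = labels + j * rows;
            const float* o = z + j * rows;
            errors += (std::max_element(l, l + rows) - l)
                   != (std::max_element(o, o + rows) - o);
        }
        return (double)errors / cols;       // err: ErrorPrediction/Sample
    }

    double Perplexity(double cePerSample)   // ce: CrossEntropyWithSoftmax/Sample
    {
        return exp(cePerSample);
    }

    int main()
    {
        printf("%.7f\n", Perplexity(0.29111851)); // ~1.3379231, matching the log
    }
)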
+Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.29111851 Perplexity = 1.3379231 COMPLETED === Deleting last epoch data ==== Re-running from checkpoint ------------------------------------------------------------------- Build info: - Built time: Nov 23 2015 10:00:15 - Last modified date: Mon Nov 23 09:45:21 2015 - Built by alexeyk on alexey-rz - Build Path: C:\src\cntk\MachineLearning\CNTK\ - CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + Built time: Dec 18 2015 15:12:36 + Last modified date: Wed Dec 16 11:33:30 2015 + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + Build Branch: + Build SHA1: + Built by alexeyk on z840-01 + Build Path: C:\src\cntk\Source\CNTK\ ------------------------------------------------------------------- -running on alexey-rz at 2015/11/23 10:35:40 +running on z840-01 at 2015/12/18 15:13:59 command line: -C:\src\cntk\x64\Debug\CNTK.exe configFile=C:\src\cntk\Tests\Image\QuickE2E\cntk.config RunDir=C:\src\cntk\Tests\Image\_run DataDir=C:\src\cntk\Tests\Image\Data ConfigDir=C:\src\cntk\Tests\Image\QuickE2E DeviceId=0 +C:\src\cntk\x64\Debug\CNTK.exe configFile=QuickE2E\cntk.config ConfigDir=QuickE2E RunDir=_out DataDir=Data DeviceId=Auto stderr=_out\gpu.txt >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=$DeviceId$ -ndlMacros=$ConfigDir$/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=$RunDir$/models/cntk.dnn - deviceId=$DeviceId$ - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = $DeviceId$ +ndlMacros = "$ConfigDir$/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "$RunDir$/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=$RunDir$/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl +test = [ + action = "test" + modelPath = "$RunDir$/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -RunDir=C:\src\cntk\Tests\Image\_run -DataDir=C:\src\cntk\Tests\Image\Data -ConfigDir=C:\src\cntk\Tests\Image\QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=_out\gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG 
>>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
-precision=float
-command=Train:Test
-deviceId=0
-ndlMacros=C:\src\cntk\Tests\Image\QuickE2E/Macros.ndl
-parallelTrain=false
-Train=[
- action=train
- modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn
- deviceId=0
- traceLevel=1
- NDLNetworkBuilder=[
- networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl
- ]
- SGD=[
- epochSize=100
- minibatchSize=10
- learningRatesPerMB=0.05
- momentumPerMB=0*10:0.7
- maxEpochs=12
+precision = "float"
+command = train:test
+deviceId = Auto
+ndlMacros = "QuickE2E/Macros.ndl"
+parallelTrain = false
+numCPUThreads = 8
+train = [
+ action = "train"
+ modelPath = "_out/models/cntk.dnn"
+ traceLevel = 1
+ NDLNetworkBuilder = [
+ networkDescription = "QuickE2E/Convolution.ndl"
 ]
- reader=[
- readerType=UCIFastReader
- file=C:\src\cntk\Tests\Image\Data/Train.txt
- features=[
- dim=784
- start=1
+ SGD = [
+ epochSize = 100
+ minibatchSize = 10
+ learningRatesPerMB = 0.05
+ momentumPerMB = 0*10:0.7
+ maxEpochs = 12
+ ]
+ reader = [
+ readerType = "UCIFastReader"
+ file = "Data/Train.txt"
+ features = [
+ dim = 784
+ start = 1
 ]
- labels=[
- dim=1
- start=0
- labelDim=10
- labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt
+ labels = [
+ dim = 1
+ start = 0
+ labelDim = 10
+ labelMappingFile = "Data/labelsmap.txt"
 ]
 ]
]
-Test=[
- action=test
- modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn
- NDLNetworkBuilder=[
- networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl
+test = [
+ action = "test"
+ modelPath = "_out/models/cntk.dnn"
+ NDLNetworkBuilder = [
+ networkDescription = "QuickE2E/Convolution.ndl"
 ]
- reader=[
- readerType=UCIFastReader
- file=C:\src\cntk\Tests\Image\Data/Test.txt
- features=[
- dim=784
- start=1
+ reader = [
+ readerType = "UCIFastReader"
+ file = "Data/Test.txt"
+ features = [
+ dim = 784
+ start = 1
 ]
- labels=[
- dim=1
- start=0
- labelDim=10
- labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt
+ labels = [
+ dim = 1
+ start = 0
+ labelDim = 10
+ labelMappingFile = "Data/labelsmap.txt"
 ]
 ]
]
-RunDir=C:\src\cntk\Tests\Image\_run
-DataDir=C:\src\cntk\Tests\Image\Data
-ConfigDir=C:\src\cntk\Tests\Image\QuickE2E
-DeviceId=0
+ConfigDir=QuickE2E
+RunDir=_out
+DataDir=Data
+DeviceId=Auto
+stderr=_out\gpu.txt
<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
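In the UCIFastReader blocks above, start and dim are column offsets into each whitespace-separated row of the data file: the label sits in column 0 (start=0, dim=1, mapped onto labelDim=10 classes through labelsmap.txt) and the 784 pixel values occupy columns 1 through 784 (start=1, dim=784). A sketch of that row layout, inferred from the config (not the reader's actual implementation):

#include <sstream>
#include <string>
#include <vector>

// Split one UCI text row into a label (column 0) and 784 feature columns,
// mirroring labels(start=0, dim=1) and features(start=1, dim=784) above.
static bool ParseUCIRow(const std::string& line, int& label, std::vector<float>& features)
{
    std::istringstream in(line);
    if (!(in >> label))        // labels: start = 0, dim = 1
        return false;
    features.resize(784);      // features: start = 1, dim = 784
    for (auto& v : features)
        if (!(in >> v))
            return false;
    return true;
}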
>>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
-configparameters: cntk.config:command=Train:Test
-configparameters: cntk.config:ConfigDir=C:\src\cntk\Tests\Image\QuickE2E
-configparameters: cntk.config:DataDir=C:\src\cntk\Tests\Image\Data
-configparameters: cntk.config:deviceId=0
-configparameters: cntk.config:ndlMacros=C:\src\cntk\Tests\Image\QuickE2E/Macros.ndl
+configparameters: cntk.config:command=train:test
+configparameters: cntk.config:ConfigDir=QuickE2E
+configparameters: cntk.config:DataDir=Data
+configparameters: cntk.config:deviceId=Auto
+configparameters: cntk.config:ndlMacros=QuickE2E/Macros.ndl
+configparameters: cntk.config:numCPUThreads=8
 configparameters: cntk.config:parallelTrain=false
 configparameters: cntk.config:precision=float
-configparameters: cntk.config:RunDir=C:\src\cntk\Tests\Image\_run
-configparameters: cntk.config:Test=[
- action=test
- modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn
- NDLNetworkBuilder=[
- networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl
+configparameters: cntk.config:RunDir=_out
+configparameters: cntk.config:stderr=_out\gpu.txt
+configparameters: cntk.config:test=[
+ action = "test"
+ modelPath = "_out/models/cntk.dnn"
+ NDLNetworkBuilder = [
+ networkDescription = "QuickE2E/Convolution.ndl"
 ]
- reader=[
- readerType=UCIFastReader
- file=C:\src\cntk\Tests\Image\Data/Test.txt
- features=[
- dim=784
- start=1
+ reader = [
+ readerType = "UCIFastReader"
+ file = "Data/Test.txt"
+ features = [
+ dim = 784
+ start = 1
 ]
- labels=[
- dim=1
- start=0
- labelDim=10
- labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt
+ labels = [
+ dim = 1
+ start = 0
+ labelDim = 10
+ labelMappingFile = "Data/labelsmap.txt"
 ]
 ]
]
-configparameters: cntk.config:Train=[
- action=train
- modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn
- deviceId=0
- traceLevel=1
- NDLNetworkBuilder=[
- networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl
- ]
- SGD=[
- epochSize=100
- minibatchSize=10
- learningRatesPerMB=0.05
- momentumPerMB=0*10:0.7
- maxEpochs=12
+configparameters: cntk.config:train=[
+ action = "train"
+ modelPath = "_out/models/cntk.dnn"
+ traceLevel = 1
+ NDLNetworkBuilder = [
+ networkDescription = "QuickE2E/Convolution.ndl"
 ]
- reader=[
- readerType=UCIFastReader
- file=C:\src\cntk\Tests\Image\Data/Train.txt
- features=[
- dim=784
- start=1
+ SGD = [
+ epochSize = 100
+ minibatchSize = 10
+ learningRatesPerMB = 0.05
+ momentumPerMB = 0*10:0.7
+ maxEpochs = 12
+ ]
+ reader = [
+ readerType = "UCIFastReader"
+ file = "Data/Train.txt"
+ features = [
+ dim = 784
+ start = 1
 ]
- labels=[
- dim=1
- start=0
- labelDim=10
- labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt
+ labels = [
+ dim = 1
+ start = 0
+ labelDim = 10
+ labelMappingFile = "Data/labelsmap.txt"
 ]
 ]
]
<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
-command: Train Test 
+command: train test 
 precision = float
-CNTKModelPath: C:\src\cntk\Tests\Image\_run/models/cntk.dnn
-CNTKCommandTrainInfo: Train : 12
+Using 8 CPU threads
+CNTKModelPath: _out/models/cntk.dnn
+CNTKCommandTrainInfo: train : 12
 CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 12
-CNTKCommandTrainBegin: Train
+CNTKCommandTrainBegin: train
+LockDevice: Locked GPU 0 to test availability.
+LockDevice: Unlocked GPU 0 after testing.
+LockDevice: Locked GPU 1 to test availability.
+LockDevice: Unlocked GPU 1 after testing.
+LockDevice: Locked GPU 2 to test availability.
+LockDevice: Unlocked GPU 2 after testing.
+LockDevice: Locked GPU 0 for exclusive use.
 NDLBuilder Using GPU 0
-reading uci file C:\src\cntk\Tests\Image\Data/Train.txt
-Starting from checkpoint. Load Network From File C:\src\cntk\Tests\Image\_run/models/cntk.dnn.11.
+Reading UCI file Data/Train.txt
+Starting from checkpoint. Load Network From File _out/models/cntk.dnn.11.
+
+Post-processing network...
+
+3 roots:
+ ce = CrossEntropyWithSoftmax
+ outputNodes.z = Plus
+ err = ErrorPrediction
+FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation
+FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation
+FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation
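The schedule syntax momentumPerMB=0*10:0.7 in the config above means: use 0 for the first 10 epochs, then 0.7 for all remaining epochs; learningRatesPerMB uses the same value*count:... grammar. A sketch of that lookup (a hypothetical parser for illustration only; the real SGD code also supports minibatch-size-dependent variants of this syntax):

#include <cstdint>
#include <cstdlib>
#include <string>
#include <vector>

struct ScheduleEntry { double value; size_t count; };

// Parse "0*10:0.7" into { {0, 10}, {0.7, unbounded} }.
static std::vector<ScheduleEntry> ParseSchedule(const std::string& s)
{
    std::vector<ScheduleEntry> entries;
    size_t pos = 0;
    while (pos < s.size())
    {
        size_t colon = s.find(':', pos);
        std::string item = s.substr(pos, colon == std::string::npos ? std::string::npos : colon - pos);
        size_t star = item.find('*');
        ScheduleEntry e;
        e.value = std::atof(item.substr(0, star).c_str());
        e.count = star == std::string::npos ? SIZE_MAX : (size_t)std::atoi(item.substr(star + 1).c_str());
        entries.push_back(e);
        if (colon == std::string::npos)
            break;
        pos = colon + 1;
    }
    return entries;
}

// Value for a 0-based epoch index; the last entry repeats forever.
static double ScheduleAt(const std::vector<ScheduleEntry>& entries, size_t epoch)
{
    for (const auto& e : entries)
    {
        if (epoch < e.count)
            return e.value;
        epoch -= e.count;
    }
    return entries.back().value;
}

With this, ScheduleAt(ParseSchedule("0*10:0.7"), 11) yields 0.7, which is why the 12th epoch below runs with effective momentum 0.7.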
-Allocating matrices for forward propagation.
-
-
-Printing Gradient Computation Node Order ... 
-
-CE[0, 0] = CrossEntropyWithSoftmax(labels[10, 0], OutputNodes.z[0, 0])
-OutputNodes.z[0, 0] = Plus(OutputNodes.t[0, 0], OutputNodes.b[10, 1])
-OutputNodes.b[10, 1] = LearnableParameter
-OutputNodes.t[0, 0] = Times(OutputNodes.W[10, 128], h1.y[0, 0])
-h1.y[0, 0] = Sigmoid(h1.z[0, 0])
-h1.z[0, 0] = Plus(h1.t[0, 0], h1.b[128, 1])
-h1.b[128, 1] = LearnableParameter
-h1.t[0, 0] = Times(h1.W[128, 512], pool2[0, 0])
-pool2[0, 0] = AveragePooling(conv2_act.act[0, 0])
-conv2_act.act[0, 0] = RectifiedLinear(conv2_act.convPlusB[0, 0])
-conv2_act.convPlusB[0, 0] = Plus(conv2_act.conv[0, 0], conv2_act.convB[32, 1])
-conv2_act.convB[32, 1] = LearnableParameter
-conv2_act.conv[0, 0] = Convolution(conv2_act.convW[32, 400], pool1[0, 0])
-pool1[0, 0] = MaxPooling(conv1_act.act[0, 0])
-conv1_act.act[0, 0] = RectifiedLinear(conv1_act.convPlusB[0, 0])
-conv1_act.convPlusB[0, 0] = Plus(conv1_act.conv[0, 0], conv1_act.convB[16, 1])
-conv1_act.convB[16, 1] = LearnableParameter
-conv1_act.conv[0, 0] = Convolution(conv1_act.convW[16, 25], featScaled[0, 0])
-featScaled[0, 0] = Scale(featScale[1, 1], features[784, 0])
-features[784, 0] = InputValue
-featScale[1, 1] = LearnableParameter
-conv1_act.convW[16, 25] = LearnableParameter
-conv2_act.convW[32, 400] = LearnableParameter
-h1.W[128, 512] = LearnableParameter
-OutputNodes.W[10, 128] = LearnableParameter
-labels[10, 0] = InputValue
-
-Validating for node CE. 26 nodes to process in pass 1.
+Validating for node ce. 26 nodes to process in pass 1.

 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -1743,15 +1175,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node CE. 15 nodes to process in pass 2.
+Validating for node ce. 15 nodes to process in pass 2.

 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -1772,15 +1204,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node CE, final verification.
+Validating for node ce, final verification.

 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -1801,19 +1233,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

 10 out of 26 nodes do not share the minibatch layout with the input data.

+Validating for node outputNodes.z. 24 nodes to process in pass 1.
-
-Validating for node CE. 26 nodes to process in pass 1.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -1834,15 +1264,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]

-Validating for node CE. 14 nodes to process in pass 2.
+Validating for node outputNodes.z. 13 nodes to process in pass 2.

-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -1863,15 +1291,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]

-Validating for node CE, final verification.
+Validating for node outputNodes.z, final verification.

-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -1892,189 +1318,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-10 out of 26 nodes do not share the minibatch layout with the input data.
-
-
-
-Validating for node OutputNodes.z. 24 nodes to process in pass 1.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
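The shape annotations in these validation lines all follow from the convolution geometry: unpadded 5x5 kernels take the 28x28 input to 24x24 (x16 maps = 9216 values), 2x2 pooling halves that to 12x12, the second 5x5 convolution over 16 input maps (kernel size 400 = 5*5*16) yields 8x8x32 = 2048, and the final pooling gives 4x4x32 = 512, the input to h1.t. A quick check of that arithmetic (kernel and pooling sizes are inferred from the dims above, not stated in this log):

#include <cstdio>

// Output width of an unpadded ("valid") convolution and of non-overlapping pooling.
static int ConvOut(int in, int kernel) { return in - kernel + 1; }
static int PoolOut(int in, int window) { return in / window; }

int main()
{
    int w = 28;            // features: 28 x 28 x 1 = 784
    w = ConvOut(w, 5);     // conv1: 24 x 24 x 16 = 9216
    std::printf("conv1: %dx%dx16 = %d\n", w, w, w * w * 16);
    w = PoolOut(w, 2);     // pool1: 12 x 12 x 16 = 2304
    std::printf("pool1: %dx%dx16 = %d\n", w, w, w * w * 16);
    w = ConvOut(w, 5);     // conv2: 8 x 8 x 32 = 2048
    std::printf("conv2: %dx%dx32 = %d\n", w, w, w * w * 32);
    w = PoolOut(w, 2);     // pool2: 4 x 4 x 32 = 512
    std::printf("pool2: %dx%dx32 = %d\n", w, w, w * w * 32);
    return 0;
}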
-
-Validating for node OutputNodes.z. 13 nodes to process in pass 2.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-Validating for node OutputNodes.z, final verification.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]

 9 out of 24 nodes do not share the minibatch layout with the input data.

-
-Validating for node OutputNodes.z. 24 nodes to process in pass 1.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-Validating for node OutputNodes.z. 13 nodes to process in pass 2.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-Validating for node OutputNodes.z, final verification.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-9 out of 24 nodes do not share the minibatch layout with the input data.
-
-
-
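The err node validated next implements the classification error rate: for each minibatch column, the argmax of the network output is compared against the argmax of the one-hot label, and mismatches are counted (ErrorPrediction/Sample = 0 at the top of this baseline therefore means all 100 test samples were classified correctly). A sketch of that criterion (illustrative only, not the node's actual code):

#include <algorithm>
#include <cstddef>
#include <vector>

// Fraction of samples whose predicted class (argmax of z) differs from the
// labeled class (argmax of the one-hot label column) -- the ErrorPrediction idea.
static double ErrorRate(const std::vector<std::vector<float>>& labels,
                        const std::vector<std::vector<float>>& z)
{
    std::size_t errors = 0;
    for (std::size_t i = 0; i < z.size(); i++)
    {
        auto pred  = std::max_element(z[i].begin(), z[i].end()) - z[i].begin();
        auto truth = std::max_element(labels[i].begin(), labels[i].end()) - labels[i].begin();
        if (pred != truth)
            errors++;
    }
    return z.empty() ? 0.0 : (double)errors / z.size();
}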
-Validating for node Err. 26 nodes to process in pass 1.
+Validating for node err. 26 nodes to process in pass 1.

 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2095,15 +1349,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node Err. 14 nodes to process in pass 2.
+Validating for node err. 14 nodes to process in pass 2.

 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2124,15 +1378,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node Err, final verification.
+Validating for node err, final verification.

 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2153,114 +1407,29 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

 10 out of 26 nodes do not share the minibatch layout with the input data.

-
-
-Validating for node Err. 26 nodes to process in pass 1.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-Validating for node Err. 14 nodes to process in pass 2.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-Validating for node Err, final verification.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-10 out of 26 nodes do not share the minibatch layout with the input data.
+Post-processing network complete.

 SGD using GPU 0.
-GetTrainCriterionNodes ...
-GetEvalCriterionNodes ...
+
+Training criterion node(s):
+
+ ce = CrossEntropyWithSoftmax
+
+Evaluation criterion node(s):
+
+ err = ErrorPrediction

-Allocating matrices for gradient computing
+Allocating matrices for forward and/or backward propagation.
 No PreCompute nodes found, skipping PreCompute step
 Warning: checkpoint file is missing. learning parameters will be initialized from 0
 Set Max Temp Mem Size For Convolution Nodes to 0 samples.
-Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 
+Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples
 starting at epoch 11 counting lines to determine record count
 1000 records found
@@ -2268,49 +1437,27 @@ starting epoch 11 at record count 1100, and file position 100
 reading from record 0 to 100 to be positioned properly for epoch
 Starting minibatch loop.
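The new baseline's Starting Epoch line above reports the momentum both per minibatch and as a time constant. Reading momentum smoothing as a first-order low-pass filter, a per-minibatch momentum mu applied once every m samples corresponds to a time constant of T = -m / ln(mu) samples; with minibatchSize = 10 and momentumPerMB = 0.7 that is -10 / ln(0.7) = 10 / 0.3567, approximately 28.0 samples, matching the printed value. (This reading is inferred from the numbers in the log, not stated in the patch.)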
-randomordering: 21 retries for 100 elements (21.0%) to ensure window condition
-randomordering: recached sequence for seed 11: 6, 31, ...
- Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.03456436; EvalErr[0]PerSample = 0.02000000; TotalTime = 0.19100s; TotalTimePerSample = 1.90999ms; SamplesPerSecond = 523
-Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 1.0345644; EvalErrPerSample = 0.02; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.223405
-CNTKCommandTrainEnd: Train
+RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 11: 6, 31, ...
+ Epoch[12 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.33976147; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.2483s; SamplesPerSecond = 402.8
+Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.33976147; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.283086
+CNTKCommandTrainEnd: train
+
+Post-processing network...
+
+3 roots:
+ ce = CrossEntropyWithSoftmax
+ outputNodes.z = Plus
+ err = ErrorPrediction
+FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation
+FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation
+FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation

-Allocating matrices for forward propagation.
-
-
-Printing Gradient Computation Node Order ... 
-
-CE[0, 0] = CrossEntropyWithSoftmax(labels[10, 0], OutputNodes.z[0, 0])
-OutputNodes.z[0, 0] = Plus(OutputNodes.t[0, 0], OutputNodes.b[10, 1])
-OutputNodes.b[10, 1] = LearnableParameter
-OutputNodes.t[0, 0] = Times(OutputNodes.W[10, 128], h1.y[0, 0])
-h1.y[0, 0] = Sigmoid(h1.z[0, 0])
-h1.z[0, 0] = Plus(h1.t[0, 0], h1.b[128, 1])
-h1.b[128, 1] = LearnableParameter
-h1.t[0, 0] = Times(h1.W[128, 512], pool2[0, 0])
-pool2[0, 0] = AveragePooling(conv2_act.act[0, 0])
-conv2_act.act[0, 0] = RectifiedLinear(conv2_act.convPlusB[0, 0])
-conv2_act.convPlusB[0, 0] = Plus(conv2_act.conv[0, 0], conv2_act.convB[32, 1])
-conv2_act.convB[32, 1] = LearnableParameter
-conv2_act.conv[0, 0] = Convolution(conv2_act.convW[32, 400], pool1[0, 0])
-pool1[0, 0] = MaxPooling(conv1_act.act[0, 0])
-conv1_act.act[0, 0] = RectifiedLinear(conv1_act.convPlusB[0, 0])
-conv1_act.convPlusB[0, 0] = Plus(conv1_act.conv[0, 0], conv1_act.convB[16, 1])
-conv1_act.convB[16, 1] = LearnableParameter
-conv1_act.conv[0, 0] = Convolution(conv1_act.convW[16, 25], featScaled[0, 0])
-featScaled[0, 0] = Scale(featScale[1, 1], features[784, 0])
-features[784, 0] = InputValue
-featScale[1, 1] = LearnableParameter
-conv1_act.convW[16, 25] = LearnableParameter
-conv2_act.convW[32, 400] = LearnableParameter
-h1.W[128, 512] = LearnableParameter
-OutputNodes.W[10, 128] = LearnableParameter
-labels[10, 0] = InputValue
-
-Validating for node CE. 26 nodes to process in pass 1.
+Validating for node ce. 26 nodes to process in pass 1.

 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2331,15 +1478,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node CE. 15 nodes to process in pass 2.
+Validating for node ce. 15 nodes to process in pass 2.

 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2360,15 +1507,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node CE, final verification.
+Validating for node ce, final verification.

 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2389,19 +1536,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

 10 out of 26 nodes do not share the minibatch layout with the input data.

+Validating for node outputNodes.z. 24 nodes to process in pass 1.
-
-Validating for node CE. 26 nodes to process in pass 1.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2422,15 +1567,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]

-Validating for node CE. 14 nodes to process in pass 2.
+Validating for node outputNodes.z. 13 nodes to process in pass 2.

-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2451,15 +1594,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]

-Validating for node CE, final verification.
+Validating for node outputNodes.z, final verification.

-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2480,189 +1621,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-10 out of 26 nodes do not share the minibatch layout with the input data.
-
-
-
-Validating for node OutputNodes.z. 24 nodes to process in pass 1.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-Validating for node OutputNodes.z. 13 nodes to process in pass 2.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-Validating for node OutputNodes.z, final verification.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]

 9 out of 24 nodes do not share the minibatch layout with the input data.

-
-Validating for node OutputNodes.z. 24 nodes to process in pass 1.
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -9 out of 24 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node err. 26 nodes to process in pass 1. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2683,15 +1652,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node err. 14 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2712,15 +1681,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node err, final verification. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2741,108 +1710,21 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err. 
14 nodes to process in pass 2. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Post-processing network complete. evalNodeNames are not specified, using all the default evalnodes and training criterion nodes. + + +Allocating matrices for forward and/or backward propagation. starting epoch 0 at record count 0, and file position 0 already there from last epoch -randomordering: 11 retries for 100 elements (11.0%) to ensure window condition -randomordering: recached sequence for seed 0: 15, 33, ... -Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.90504265 Perplexity = 2.4720373 +RandomOrdering: 11 retries for 100 elements (11.0%) to ensure window condition +RandomOrdering: recached sequence for seed 0: 15, 33, ... 
+Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.30440025 Perplexity = 1.3558116 COMPLETED diff --git a/Tests/EndToEndTests/Image/QuickE2E/baseline.windows.release.gpu.txt b/Tests/EndToEndTests/Image/QuickE2E/baseline.windows.release.gpu.txt index 81ab0c056..c34022c07 100644 --- a/Tests/EndToEndTests/Image/QuickE2E/baseline.windows.release.gpu.txt +++ b/Tests/EndToEndTests/Image/QuickE2E/baseline.windows.release.gpu.txt @@ -1,224 +1,245 @@ ------------------------------------------------------------------- Build info: - Built time: Nov 23 2015 09:55:26 - Last modified date: Mon Nov 23 09:45:21 2015 - Built by alexeyk on alexey-rz - Build Path: C:\src\cntk\MachineLearning\CNTK\ + Built time: Dec 18 2015 14:55:05 + Last modified date: Wed Dec 16 11:33:30 2015 CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + Build Branch: + Build SHA1: + Built by alexeyk on z840-01 + Build Path: C:\src\cntk\Source\CNTK\ ------------------------------------------------------------------- -running on alexey-rz at 2015/11/23 10:09:12 +running on z840-01 at 2015/12/18 14:58:21 command line: -C:\src\cntk\x64\Release\CNTK.exe configFile=C:\src\cntk\Tests\Image\QuickE2E\cntk.config RunDir=C:\src\cntk\Tests\Image\_run DataDir=C:\src\cntk\Tests\Image\Data ConfigDir=C:\src\cntk\Tests\Image\QuickE2E DeviceId=0 +C:\src\cntk\x64\Release\CNTK.exe configFile=QuickE2E\cntk.config ConfigDir=QuickE2E RunDir=_out DataDir=Data DeviceId=Auto stderr=_out\gpu.txt >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=$DeviceId$ -ndlMacros=$ConfigDir$/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=$RunDir$/models/cntk.dnn - deviceId=$DeviceId$ - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = $DeviceId$ +ndlMacros = "$ConfigDir$/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "$RunDir$/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=$RunDir$/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl +test = [ + action = "test" + modelPath = "$RunDir$/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] 
-RunDir=C:\src\cntk\Tests\Image\_run -DataDir=C:\src\cntk\Tests\Image\Data -ConfigDir=C:\src\cntk\Tests\Image\QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=_out\gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=0 -ndlMacros=C:\src\cntk\Tests\Image\QuickE2E/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = Auto +ndlMacros = "QuickE2E/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl +test = [ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -RunDir=C:\src\cntk\Tests\Image\_run -DataDir=C:\src\cntk\Tests\Image\Data -ConfigDir=C:\src\cntk\Tests\Image\QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=_out\gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -configparameters: cntk.config:command=Train:Test -configparameters: cntk.config:ConfigDir=C:\src\cntk\Tests\Image\QuickE2E -configparameters: cntk.config:DataDir=C:\src\cntk\Tests\Image\Data -configparameters: cntk.config:deviceId=0 -configparameters: cntk.config:ndlMacros=C:\src\cntk\Tests\Image\QuickE2E/Macros.ndl +configparameters: cntk.config:command=train:test +configparameters: cntk.config:ConfigDir=QuickE2E +configparameters: cntk.config:DataDir=Data +configparameters: cntk.config:deviceId=Auto +configparameters: cntk.config:ndlMacros=QuickE2E/Macros.ndl +configparameters: cntk.config:numCPUThreads=8 configparameters: cntk.config:parallelTrain=false configparameters: cntk.config:precision=float -configparameters: 
cntk.config:RunDir=C:\src\cntk\Tests\Image\_run -configparameters: cntk.config:Test=[ - action=test - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl +configparameters: cntk.config:RunDir=_out +configparameters: cntk.config:stderr=_out\gpu.txt +configparameters: cntk.config:test=[ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -configparameters: cntk.config:Train=[ - action=train - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +configparameters: cntk.config:train=[ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< -command: Train Test +command: train test precision = float -CNTKModelPath: C:\src\cntk\Tests\Image\_run/models/cntk.dnn -CNTKCommandTrainInfo: Train : 12 +Using 8 CPU threads +CNTKModelPath: _out/models/cntk.dnn +CNTKCommandTrainInfo: train : 12 CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 12 -CNTKCommandTrainBegin: Train +CNTKCommandTrainBegin: train +LockDevice: Locked GPU 0 to test availability. +LockDevice: Unlocked GPU 0 after testing. +LockDevice: Locked GPU 1 to test availability. +LockDevice: Unlocked GPU 1 after testing. +LockDevice: Locked GPU 2 to test availability. +LockDevice: Unlocked GPU 2 after testing. +LockDevice: Locked GPU 0 for exclusive use. NDLBuilder Using GPU 0 -reading uci file C:\src\cntk\Tests\Image\Data/Train.txt +Reading UCI file Data/Train.txt +Microsoft::MSR::CNTK::GPUMatrix::SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4 + +Post-processing network... + +3 roots: + err = ErrorPrediction + outputNodes.z = Plus + ce = CrossEntropyWithSoftmax +FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation +FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation +FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation -Allocating matrices for forward propagation. - - -Validating for node CE. 26 nodes to process in pass 1. +Validating for node err. 
26 nodes to process in pass 1. Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -227,27 +248,27 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = 
Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE. 15 nodes to process in pass 2. +Validating for node err. 15 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -256,27 +277,27 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = 
LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE, final verification. +Validating for node err, final verification. Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -285,31 +306,29 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], 
pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Validating for node outputNodes.z. 24 nodes to process in pass 1. -Validating for node CE. 26 nodes to process in pass 1. - -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -318,27 +337,25 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating 
--> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE. 14 nodes to process in pass 2. +Validating for node outputNodes.z. 13 nodes to process in pass 2. -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -347,27 +364,25 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = 
Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE, final verification. +Validating for node outputNodes.z, final verification. 
-Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -376,201 +391,29 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] 9 out of 24 nodes do not share the minibatch layout with the input data. - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] - -9 out of 24 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node ce. 26 nodes to process in pass 1. 
Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -579,27 +422,27 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, 
MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node ce. 14 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -608,27 +451,27 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> 
OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node ce, final verification. Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -637,126 +480,40 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, 
MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Post-processing network complete. - -Validating for node Err. 26 nodes to process in pass 1. - -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = 
ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -Validating for node Err. 14 nodes to process in pass 2. - -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -Validating for node Err, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - -SetUniformRandomValue (GPU): creating curand object with seed 1 SGD using GPU 0. -GetTrainCriterionNodes ... -GetEvalCriterionNodes ... + +Training criterion node(s): + ce = CrossEntropyWithSoftmax + +Evaluation criterion node(s): + err = ErrorPrediction -Allocating matrices for gradient computing +Allocating matrices for forward and/or backward propagation. No PreCompute nodes found, skipping PreCompute step Set Max Temp Mem Size For Convolution Nodes to 0 samples. -Starting Epoch 1: learning rate per sample = 0.005000 effective momentum = 0.000000 +Starting Epoch 1: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting at epoch 0 counting lines to determine record count 1000 records found @@ -764,119 +521,125 @@ starting epoch 0 at record count 0, and file position 0 already there from last epoch Starting minibatch loop. -randomordering: 11 retries for 100 elements (11.0%) to ensure window condition -randomordering: recached sequence for seed 0: 15, 33, ... 
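
A note for readers decoding the [rows {W=.., H=.., C=..}, MBSize n] annotations in the validation listings above: every row count follows from the image geometry. Assuming 5x5 kernels and 2x2 non-overlapping pooling, which is what the weight shapes [16, 25] (16 maps x 5*5*1 weights) and [32, 400] (32 maps x 5*5*16 weights) suggest, the sizes check out; a compile-time C++ sanity check:

    // Assumption: 5x5 kernels and 2x2 pooling, inferred from the weight
    // shapes above, not read out of the network configuration.
    static_assert((28 - 5 + 1) * (28 - 5 + 1) * 16 == 9216, "conv1 -> 24 x 24 x 16");
    static_assert((24 / 2) * (24 / 2) * 16 == 2304,         "pool1 -> 12 x 12 x 16");
    static_assert((12 - 5 + 1) * (12 - 5 + 1) * 32 == 2048, "conv2 -> 8 x 8 x 32");
    static_assert((8 / 2) * (8 / 2) * 32 == 512,            "pool2 -> 4 x 4 x 32");

The final [512] vector is what feeds the h1.t = Times(h1.W[128, 512], pool2[512 ...]) line.
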
- Epoch[ 1 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.37891785; EvalErr[0]PerSample = 0.93000000; TotalTime = 0.88819s; TotalTimePerSample = 8.88193ms; SamplesPerSecond = 112 -Finished Epoch[ 1 of 12]: [Training Set] TrainLossPerSample = 2.3789177; EvalErrPerSample = 0.93000001; AvgLearningRatePerSample = 0.004999999888; EpochTime=1.054592 -Starting Epoch 2: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 11 retries for 100 elements (11.0%) to ensure window condition +RandomOrdering: recached sequence for seed 0: 15, 33, ... + Epoch[ 1 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.34708694; EvalErr[0]PerSample = 0.92000000; TotalTime = 0.4657s; SamplesPerSecond = 214.7 +Finished Epoch[ 1 of 12]: [Training Set] TrainLossPerSample = 2.3470869; EvalErrPerSample = 0.91999996; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.484419 +Starting Epoch 2: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 1 at record count 100, and file position 100 already there from last epoch Starting minibatch loop. -randomordering: 26 retries for 100 elements (26.0%) to ensure window condition -randomordering: recached sequence for seed 1: 20, 26, ... - Epoch[ 2 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.37942505; EvalErr[0]PerSample = 0.91000000; TotalTime = 0.03505s; TotalTimePerSample = 0.35045ms; SamplesPerSecond = 2853 -Finished Epoch[ 2 of 12]: [Training Set] TrainLossPerSample = 2.379425; EvalErrPerSample = 0.90999997; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.035368 -Starting Epoch 3: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 26 retries for 100 elements (26.0%) to ensure window condition +RandomOrdering: recached sequence for seed 1: 20, 26, ... + Epoch[ 2 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.29444092; EvalErr[0]PerSample = 0.85000000; TotalTime = 0.0227s; SamplesPerSecond = 4400.1 +Finished Epoch[ 2 of 12]: [Training Set] TrainLossPerSample = 2.294441; EvalErrPerSample = 0.84999996; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.024503 +Starting Epoch 3: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 2 at record count 200, and file position 200 already there from last epoch Starting minibatch loop. -randomordering: 28 retries for 100 elements (28.0%) to ensure window condition -randomordering: recached sequence for seed 2: 4, 35, ... - Epoch[ 3 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.32070969; EvalErr[0]PerSample = 0.85000000; TotalTime = 0.03474s; TotalTimePerSample = 0.34742ms; SamplesPerSecond = 2878 -Finished Epoch[ 3 of 12]: [Training Set] TrainLossPerSample = 2.3207097; EvalErrPerSample = 0.84999996; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.035036 -Starting Epoch 4: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 28 retries for 100 elements (28.0%) to ensure window condition +RandomOrdering: recached sequence for seed 2: 4, 35, ... 
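
The RandomOrdering lines above report how many candidate swaps the randomizer had to re-draw so that no sample ends up outside a fixed window of its original position. The sketch below shows one way such a constrained shuffle can be implemented; the acceptance rule and the window parameter are illustrative assumptions, not CNTK's actual randomizer:

    #include <random>
    #include <utility>
    #include <vector>

    // Shuffle 0..n-1 such that every element stays within 'window' slots of
    // its original position; rejected swaps are counted, like the log's
    // "N retries for M elements" figure.
    std::vector<size_t> WindowedShuffle(size_t n, size_t window, unsigned seed, size_t& retries)
    {
        std::vector<size_t> order(n);
        for (size_t i = 0; i < n; i++)
            order[i] = i;
        std::mt19937 rng(seed);
        std::uniform_int_distribution<size_t> pick(0, n - 1);
        auto inWindow = [&](size_t value, size_t slot)
        { return (value > slot ? value - slot : slot - value) <= window; };
        retries = 0;
        for (size_t i = 0; i < n; i++)
        {
            size_t j = pick(rng);
            while (!inWindow(order[j], i) || !inWindow(order[i], j))
            {
                retries++; // swap would violate the window condition; re-draw
                j = pick(rng);
            }
            std::swap(order[i], order[j]);
        }
        return order;
    }

For the 100-element epochs in this log, the reported rejection rate stays in the 11 to 28 percent range.
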
+ Epoch[ 3 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.13786591; EvalErr[0]PerSample = 0.73000000; TotalTime = 0.0224s; SamplesPerSecond = 4464.7 +Finished Epoch[ 3 of 12]: [Training Set] TrainLossPerSample = 2.1378658; EvalErrPerSample = 0.72999996; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.024125 +Starting Epoch 4: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 3 at record count 300, and file position 300 already there from last epoch Starting minibatch loop. -randomordering: 17 retries for 100 elements (17.0%) to ensure window condition -randomordering: recached sequence for seed 3: 28, 7, ... - Epoch[ 4 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.36838959; EvalErr[0]PerSample = 0.90000000; TotalTime = 0.03532s; TotalTimePerSample = 0.35322ms; SamplesPerSecond = 2831 -Finished Epoch[ 4 of 12]: [Training Set] TrainLossPerSample = 2.3683896; EvalErrPerSample = 0.89999998; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.03561 -Starting Epoch 5: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 17 retries for 100 elements (17.0%) to ensure window condition +RandomOrdering: recached sequence for seed 3: 28, 7, ... + Epoch[ 4 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.03929321; EvalErr[0]PerSample = 0.75000000; TotalTime = 0.0230s; SamplesPerSecond = 4355.8 +Finished Epoch[ 4 of 12]: [Training Set] TrainLossPerSample = 2.0392931; EvalErrPerSample = 0.75; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.024759 +Starting Epoch 5: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 4 at record count 400, and file position 400 already there from last epoch Starting minibatch loop. -randomordering: 15 retries for 100 elements (15.0%) to ensure window condition -randomordering: recached sequence for seed 4: 5, 36, ... - Epoch[ 5 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.32354156; EvalErr[0]PerSample = 0.84000000; TotalTime = 0.03528s; TotalTimePerSample = 0.35281ms; SamplesPerSecond = 2834 -Finished Epoch[ 5 of 12]: [Training Set] TrainLossPerSample = 2.3235414; EvalErrPerSample = 0.83999997; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.035547 -Starting Epoch 6: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 15 retries for 100 elements (15.0%) to ensure window condition +RandomOrdering: recached sequence for seed 4: 5, 36, ... + Epoch[ 5 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.77985336; EvalErr[0]PerSample = 0.53000000; TotalTime = 0.0193s; SamplesPerSecond = 5174.4 +Finished Epoch[ 5 of 12]: [Training Set] TrainLossPerSample = 1.7798533; EvalErrPerSample = 0.52999997; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.020804 +Starting Epoch 6: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 5 at record count 500, and file position 500 already there from last epoch Starting minibatch loop. -randomordering: 13 retries for 100 elements (13.0%) to ensure window condition -randomordering: recached sequence for seed 5: 11, 48, ... 
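
One readability change in the per-minibatch progress lines: the old TotalTimePerSample column is gone, and the remaining SamplesPerSecond figure is simply SamplesSeen / TotalTime. Checking epoch 1 above:

    #include <cstdio>

    int main()
    {
        // Epoch 1 of the new log: SamplesSeen = 100, TotalTime = 0.4657s
        std::printf("SamplesPerSecond = %.1f\n", 100 / 0.4657); // prints 214.7
    }
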
- Epoch[ 6 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.24672409; EvalErr[0]PerSample = 0.83000000; TotalTime = 0.03495s; TotalTimePerSample = 0.34947ms; SamplesPerSecond = 2861 -Finished Epoch[ 6 of 12]: [Training Set] TrainLossPerSample = 2.2467241; EvalErrPerSample = 0.82999998; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.035271 -Starting Epoch 7: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 13 retries for 100 elements (13.0%) to ensure window condition +RandomOrdering: recached sequence for seed 5: 11, 48, ... + Epoch[ 6 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.49362656; EvalErr[0]PerSample = 0.14000000; TotalTime = 0.0194s; SamplesPerSecond = 5161.0 +Finished Epoch[ 6 of 12]: [Training Set] TrainLossPerSample = 1.4936265; EvalErrPerSample = 0.14; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.020921 +Starting Epoch 7: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 6 at record count 600, and file position 600 already there from last epoch Starting minibatch loop. -randomordering: 13 retries for 100 elements (13.0%) to ensure window condition -randomordering: recached sequence for seed 6: 15, 3, ... - Epoch[ 7 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.09912888; EvalErr[0]PerSample = 0.69000000; TotalTime = 0.03487s; TotalTimePerSample = 0.34871ms; SamplesPerSecond = 2867 -Finished Epoch[ 7 of 12]: [Training Set] TrainLossPerSample = 2.0991287; EvalErrPerSample = 0.69; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.035159 -Starting Epoch 8: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 13 retries for 100 elements (13.0%) to ensure window condition +RandomOrdering: recached sequence for seed 6: 15, 3, ... + Epoch[ 7 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.17570114; EvalErr[0]PerSample = 0.14000000; TotalTime = 0.0207s; SamplesPerSecond = 4830.0 +Finished Epoch[ 7 of 12]: [Training Set] TrainLossPerSample = 1.1757011; EvalErrPerSample = 0.14; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.02243 +Starting Epoch 8: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 7 at record count 700, and file position 700 already there from last epoch Starting minibatch loop. -randomordering: 22 retries for 100 elements (22.0%) to ensure window condition -randomordering: recached sequence for seed 7: 9, 19, ... - Epoch[ 8 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.01871979; EvalErr[0]PerSample = 0.61000000; TotalTime = 0.03490s; TotalTimePerSample = 0.34905ms; SamplesPerSecond = 2864 -Finished Epoch[ 8 of 12]: [Training Set] TrainLossPerSample = 2.0187197; EvalErrPerSample = 0.61000001; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.035189 -Starting Epoch 9: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 22 retries for 100 elements (22.0%) to ensure window condition +RandomOrdering: recached sequence for seed 7: 9, 19, ... 
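
The "starting epoch N at record count C, and file position P" bookkeeping is also easy to verify: with epochSize = 100 and the "1000 records found" at startup, the file position is the record count modulo the corpus size, which is why epoch 10 wraps back to file position 0:

    #include <cstdio>
    #include <initializer_list>

    int main()
    {
        const int epochSize = 100, totalRecords = 1000;
        for (int epoch : {8, 9, 10, 11}) // 0-based epoch numbers, as in the log
            std::printf("starting epoch %d at record count %d, and file position %d\n",
                        epoch, epoch * epochSize, (epoch * epochSize) % totalRecords);
    }
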
+ Epoch[ 8 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.98662323; EvalErr[0]PerSample = 0.05000000; TotalTime = 0.0202s; SamplesPerSecond = 4952.7 +Finished Epoch[ 8 of 12]: [Training Set] TrainLossPerSample = 0.98662323; EvalErrPerSample = 0.049999997; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.021894 +Starting Epoch 9: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 8 at record count 800, and file position 800 already there from last epoch Starting minibatch loop. -randomordering: 16 retries for 100 elements (16.0%) to ensure window condition -randomordering: recached sequence for seed 8: 8, 5, ... - Epoch[ 9 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.75549896; EvalErr[0]PerSample = 0.35000000; TotalTime = 0.03488s; TotalTimePerSample = 0.34884ms; SamplesPerSecond = 2866 -Finished Epoch[ 9 of 12]: [Training Set] TrainLossPerSample = 1.7554989; EvalErrPerSample = 0.34999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.03521 -Starting Epoch 10: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 16 retries for 100 elements (16.0%) to ensure window condition +RandomOrdering: recached sequence for seed 8: 8, 5, ... + Epoch[ 9 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.72003899; EvalErr[0]PerSample = 0.01000000; TotalTime = 0.0202s; SamplesPerSecond = 4960.1 +Finished Epoch[ 9 of 12]: [Training Set] TrainLossPerSample = 0.72003895; EvalErrPerSample = 0.0099999998; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.021856 +Starting Epoch 10: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 9 at record count 900, and file position 900 already there from last epoch Starting minibatch loop. -randomordering: 16 retries for 100 elements (16.0%) to ensure window condition -randomordering: recached sequence for seed 9: 7, 10, ... - Epoch[10 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.64107086; EvalErr[0]PerSample = 0.39000000; TotalTime = 0.03478s; TotalTimePerSample = 0.34779ms; SamplesPerSecond = 2875 -Finished Epoch[10 of 12]: [Training Set] TrainLossPerSample = 1.6410708; EvalErrPerSample = 0.38999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.035064 -Starting Epoch 11: learning rate per sample = 0.005000 effective momentum = 0.700000 +RandomOrdering: 16 retries for 100 elements (16.0%) to ensure window condition +RandomOrdering: recached sequence for seed 9: 7, 10, ... + Epoch[10 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.60043072; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0222s; SamplesPerSecond = 4494.4 +Finished Epoch[10 of 12]: [Training Set] TrainLossPerSample = 0.60043073; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.023996 +Starting Epoch 11: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples starting epoch 10 at record count 1000, and file position 0 already there from last epoch Starting minibatch loop. -randomordering: 22 retries for 100 elements (22.0%) to ensure window condition -randomordering: recached sequence for seed 10: 13, 22, ... 
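
The new Starting Epoch lines add a "momentum as time constant" figure. It is consistent with treating the effective momentum m, applied once per 10-sample minibatch here (100 samples in 10 minibatches), as a first-order low-pass filter with time constant T = -mbSize / ln(m); note this formula is inferred from the logged numbers, not taken from the source:

    #include <cmath>
    #include <cstdio>

    // Time constant T (in samples) such that the effective per-minibatch
    // momentum equals exp(-mbSize / T). Inferred relationship, see above.
    double MomentumAsTimeConstant(double momentum, double mbSize)
    {
        return momentum > 0 ? -mbSize / std::log(momentum) : 0.0;
    }

    int main()
    {
        std::printf("%.1f samples\n", MomentumAsTimeConstant(0.0, 10)); // epochs 1-10: 0.0
        std::printf("%.1f samples\n", MomentumAsTimeConstant(0.7, 10)); // epochs 11-12: 28.0
    }
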
- Epoch[11 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.30029129; EvalErr[0]PerSample = 0.12000000; TotalTime = 0.03496s; TotalTimePerSample = 0.34960ms; SamplesPerSecond = 2860 -Finished Epoch[11 of 12]: [Training Set] TrainLossPerSample = 1.3002913; EvalErrPerSample = 0.12; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.035526 -Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 +RandomOrdering: 22 retries for 100 elements (22.0%) to ensure window condition +RandomOrdering: recached sequence for seed 10: 13, 22, ... + Epoch[11 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.42560429; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0216s; SamplesPerSecond = 4639.5 +Finished Epoch[11 of 12]: [Training Set] TrainLossPerSample = 0.42560428; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.023399 +Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples starting epoch 11 at record count 1100, and file position 100 already there from last epoch Starting minibatch loop. -randomordering: 21 retries for 100 elements (21.0%) to ensure window condition -randomordering: recached sequence for seed 11: 6, 31, ... - Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.01696381; EvalErr[0]PerSample = 0.05000000; TotalTime = 0.03480s; TotalTimePerSample = 0.34798ms; SamplesPerSecond = 2873 -Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 1.0169638; EvalErrPerSample = 0.049999997; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.035119 -CNTKCommandTrainEnd: Train +RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition +RandomOrdering: recached sequence for seed 11: 6, 31, ... + Epoch[12 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.33292500; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0197s; SamplesPerSecond = 5079.5 +Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.33292499; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.021406 +CNTKCommandTrainEnd: train + +Post-processing network... + +3 roots: + outputNodes.z = Plus + ce = CrossEntropyWithSoftmax + err = ErrorPrediction +FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation +FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation +FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation -Allocating matrices for forward propagation. +Validating for node outputNodes.z. 24 nodes to process in pass 1. - -Validating for node CE. 26 nodes to process in pass 1. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -897,15 +660,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE. 15 nodes to process in pass 2. +Validating for node outputNodes.z. 14 nodes to process in pass 2. -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -926,15 +687,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE, final verification. +Validating for node outputNodes.z, final verification. 
-Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -955,280 +714,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node CE. 26 nodes to process in pass 1. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for 
node CE. 14 nodes to process in pass 2. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node CE, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] 9 out of 24 nodes do not share the minibatch layout with the input data. - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -9 out of 24 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node ce. 26 nodes to process in pass 1. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1249,15 +745,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node ce. 14 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1278,15 +774,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node ce, final verification. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1307,19 +803,18 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node err. 26 nodes to process in pass 1. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1340,15 +835,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node err. 14 nodes to process in pass 2. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1369,15 +864,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node err, final verification. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1398,244 +893,268 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Post-processing network complete. evalNodeNames are not specified, using all the default evalnodes and training criterion nodes. + + +Allocating matrices for forward and/or backward propagation. starting epoch 0 at record count 0, and file position 0 already there from last epoch -randomordering: 11 retries for 100 elements (11.0%) to ensure window condition -randomordering: recached sequence for seed 0: 15, 33, ... 
-Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.87062637 Perplexity = 2.3884064 +RandomOrdering: 11 retries for 100 elements (11.0%) to ensure window condition +RandomOrdering: recached sequence for seed 0: 15, 33, ... +Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.29111847 Perplexity = 1.3379231 COMPLETED === Deleting last epoch data ==== Re-running from checkpoint ------------------------------------------------------------------- Build info: - Built time: Nov 23 2015 09:55:26 - Last modified date: Mon Nov 23 09:45:21 2015 - Built by alexeyk on alexey-rz - Build Path: C:\src\cntk\MachineLearning\CNTK\ - CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + Built time: Dec 18 2015 14:55:05 + Last modified date: Wed Dec 16 11:33:30 2015 + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0 + Build Branch: + Build SHA1: + Built by alexeyk on z840-01 + Build Path: C:\src\cntk\Source\CNTK\ ------------------------------------------------------------------- -running on alexey-rz at 2015/11/23 10:32:35 +running on z840-01 at 2015/12/18 15:06:14 command line: -C:\src\cntk\x64\Release\CNTK.exe configFile=C:\src\cntk\Tests\Image\QuickE2E\cntk.config RunDir=C:\src\cntk\Tests\Image\_run DataDir=C:\src\cntk\Tests\Image\Data ConfigDir=C:\src\cntk\Tests\Image\QuickE2E DeviceId=0 +C:\src\cntk\x64\Release\CNTK.exe configFile=QuickE2E\cntk.config ConfigDir=QuickE2E RunDir=_out DataDir=Data DeviceId=Auto stderr=_out\gpu.txt >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=$DeviceId$ -ndlMacros=$ConfigDir$/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=$RunDir$/models/cntk.dnn - deviceId=$DeviceId$ - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = $DeviceId$ +ndlMacros = "$ConfigDir$/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "$RunDir$/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=$RunDir$/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl +test = [ + action = "test" + modelPath = "$RunDir$/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + 
labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -RunDir=C:\src\cntk\Tests\Image\_run -DataDir=C:\src\cntk\Tests\Image\Data -ConfigDir=C:\src\cntk\Tests\Image\QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=_out\gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=0 -ndlMacros=C:\src\cntk\Tests\Image\QuickE2E/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = Auto +ndlMacros = "QuickE2E/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl +test = [ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -RunDir=C:\src\cntk\Tests\Image\_run -DataDir=C:\src\cntk\Tests\Image\Data -ConfigDir=C:\src\cntk\Tests\Image\QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=_out\gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -configparameters: cntk.config:command=Train:Test -configparameters: cntk.config:ConfigDir=C:\src\cntk\Tests\Image\QuickE2E -configparameters: cntk.config:DataDir=C:\src\cntk\Tests\Image\Data -configparameters: cntk.config:deviceId=0 -configparameters: cntk.config:ndlMacros=C:\src\cntk\Tests\Image\QuickE2E/Macros.ndl +configparameters: cntk.config:command=train:test +configparameters: cntk.config:ConfigDir=QuickE2E +configparameters: cntk.config:DataDir=Data +configparameters: cntk.config:deviceId=Auto +configparameters: cntk.config:ndlMacros=QuickE2E/Macros.ndl +configparameters: cntk.config:numCPUThreads=8 configparameters: cntk.config:parallelTrain=false configparameters: cntk.config:precision=float 
-configparameters: cntk.config:RunDir=C:\src\cntk\Tests\Image\_run -configparameters: cntk.config:Test=[ - action=test - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl +configparameters: cntk.config:RunDir=_out +configparameters: cntk.config:stderr=_out\gpu.txt +configparameters: cntk.config:test=[ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -configparameters: cntk.config:Train=[ - action=train - modelPath=C:\src\cntk\Tests\Image\_run/models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=C:\src\cntk\Tests\Image\QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +configparameters: cntk.config:train=[ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=C:\src\cntk\Tests\Image\Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=C:\src\cntk\Tests\Image\Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< -command: Train Test +command: train test precision = float -CNTKModelPath: C:\src\cntk\Tests\Image\_run/models/cntk.dnn -CNTKCommandTrainInfo: Train : 12 +Using 8 CPU threads +CNTKModelPath: _out/models/cntk.dnn +CNTKCommandTrainInfo: train : 12 CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 12 -CNTKCommandTrainBegin: Train +CNTKCommandTrainBegin: train +LockDevice: Locked GPU 0 to test availability. +LockDevice: Unlocked GPU 0 after testing. +LockDevice: Locked GPU 1 to test availability. +LockDevice: Unlocked GPU 1 after testing. +LockDevice: Locked GPU 2 to test availability. +LockDevice: Unlocked GPU 2 after testing. +LockDevice: Locked GPU 0 for exclusive use. NDLBuilder Using GPU 0 -reading uci file C:\src\cntk\Tests\Image\Data/Train.txt -Starting from checkpoint. Load Network From File C:\src\cntk\Tests\Image\_run/models/cntk.dnn.11. +Reading UCI file Data/Train.txt +Starting from checkpoint. Load Network From File _out/models/cntk.dnn.11. + +Post-processing network... + +3 roots: + ce = CrossEntropyWithSoftmax + outputNodes.z = Plus + err = ErrorPrediction +FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation +FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation +FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation -Allocating matrices for forward propagation. - - -Validating for node CE. 
26 nodes to process in pass 1. +Validating for node ce. 26 nodes to process in pass 1. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1656,15 +1175,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE. 15 nodes to process in pass 2. +Validating for node ce. 15 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1685,15 +1204,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE, final verification. +Validating for node ce, final verification. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1714,19 +1233,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Validating for node outputNodes.z. 24 nodes to process in pass 1. -Validating for node CE. 26 nodes to process in pass 1. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1747,15 +1264,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE. 14 nodes to process in pass 2. +Validating for node outputNodes.z. 13 nodes to process in pass 2. 
-Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1776,15 +1291,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE, final verification. +Validating for node outputNodes.z, final verification. -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1805,189 +1318,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] 9 out of 24 nodes do not share the minibatch layout with the input data. - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -9 out of 24 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node err. 26 nodes to process in pass 1. 
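The tensor dimensions repeated throughout these validation dumps are internally consistent, assuming 5x5 kernels (convW [16, 25] is 16 maps over 5*5 inputs; convW [32, 400] is 32 maps over 5*5*16 inputs) and 2x2 pooling with stride 2. The kernel and stride values are inferred from the shapes, not stated in the log. A compile-time check of the chain, as a sketch:

    // "valid" convolution: out = in - kernel + 1; 2x2 pooling halves each spatial dim
    static_assert(28 * 28 * 1  == 784,  "features: 28 x 28 x 1");
    static_assert(24 * 24 * 16 == 9216, "conv1: 28 - 5 + 1 = 24");
    static_assert(12 * 12 * 16 == 2304, "pool1: 24 / 2 = 12");
    static_assert( 8 *  8 * 32 == 2048, "conv2: 12 - 5 + 1 = 8");
    static_assert( 4 *  4 * 32 == 512,  "pool2: 8 / 2 = 4");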
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2008,15 +1349,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node err. 14 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2037,15 +1378,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node err, final verification. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2066,114 +1407,29 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. - - -Validating for node Err. 26 nodes to process in pass 1. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = 
Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err. 14 nodes to process in pass 2. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. +Post-processing network complete. SGD using GPU 0. -GetTrainCriterionNodes ... -GetEvalCriterionNodes ... + +Training criterion node(s): + ce = CrossEntropyWithSoftmax + +Evaluation criterion node(s): + err = ErrorPrediction -Allocating matrices for gradient computing +Allocating matrices for forward and/or backward propagation. No PreCompute nodes found, skipping PreCompute step Warning: checkpoint file is missing. learning parameters will be initialized from 0 Set Max Temp Mem Size For Convolution Nodes to 0 samples. -Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 +Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples starting at epoch 11 counting lines to determine record count 1000 records found @@ -2181,20 +1437,26 @@ starting epoch 11 at record count 1100, and file position 100 reading from record 0 to 100 to be positioned properly for epoch Starting minibatch loop. -randomordering: 21 retries for 100 elements (21.0%) to ensure window condition -randomordering: recached sequence for seed 11: 6, 31, ... 
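The per-sample figures in the "Starting Epoch 12" line above follow from the config: learningRatesPerMB = 0.05 over a minibatch of 10 gives 0.005 per sample, the schedule momentumPerMB = 0*10:0.7 switches from 0 to 0.7 after 10 epochs (hence 0.7 at epoch 12), and the reported "momentum as time constant" is consistent with -mbSize / ln(momentum). A small check under those assumptions:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const double lrPerMB = 0.05, momentum = 0.7;
        const int mbSize = 10;
        std::printf("lr per sample = %f\n", lrPerMB / mbSize);  // 0.005000
        std::printf("time constant = %.1f samples\n",
                    -mbSize / std::log(momentum));              // 28.0
    }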
- Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.03456436; EvalErr[0]PerSample = 0.02000000; TotalTime = 0.15455s; TotalTimePerSample = 1.54549ms; SamplesPerSecond = 647 -Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 1.0345644; EvalErrPerSample = 0.02; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.176761 -CNTKCommandTrainEnd: Train +RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition +RandomOrdering: recached sequence for seed 11: 6, 31, ... + Epoch[12 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.33976151; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.7157s; SamplesPerSecond = 139.7 +Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.3397615; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.736517 +CNTKCommandTrainEnd: train + +Post-processing network... + +3 roots: + outputNodes.z = Plus + ce = CrossEntropyWithSoftmax + err = ErrorPrediction +FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation +FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation +FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation -Allocating matrices for forward propagation. +Validating for node outputNodes.z. 24 nodes to process in pass 1. - -Validating for node CE. 26 nodes to process in pass 1. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2215,15 +1477,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE. 15 nodes to process in pass 2. +Validating for node outputNodes.z. 14 nodes to process in pass 2. 
-Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2244,15 +1504,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE, final verification. +Validating for node outputNodes.z, final verification. -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2273,280 +1531,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node CE. 26 nodes to process in pass 1. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node CE. 14 nodes to process in pass 2. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node CE, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] 9 out of 24 nodes do not share the minibatch layout with the input data. - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -9 out of 24 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node ce. 26 nodes to process in pass 1. 
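The "9 out of 24 nodes do not share the minibatch layout" tally above is exactly the set of LearnableParameter nodes, whose dimensions are fixed and carry no per-sample axis; the 26-node criterion graphs report 10 because the scalar ce/err node has no layout either. This reading is an inference from the dumps, sketched as a count:

    // Hypothetical tally of the layout-free nodes in the 24-node graph,
    // taken from the LearnableParameter lines in the dump above.
    const char* noLayout[] = {
        "outputNodes.W", "h1.W", "conv2_act.convW", "conv1_act.convW",
        "featScale", "conv1_act.convB", "conv2_act.convB", "h1.b", "outputNodes.b",
    };
    static_assert(sizeof(noLayout) / sizeof(noLayout[0]) == 9, "matches the log");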
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2567,15 +1562,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node ce. 14 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2596,15 +1591,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node ce, final verification. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2625,19 +1620,18 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node err. 26 nodes to process in pass 1. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2658,15 +1652,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node err. 14 nodes to process in pass 2. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2687,15 +1681,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node err, final verification. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2716,17 +1710,21 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Post-processing network complete. evalNodeNames are not specified, using all the default evalnodes and training criterion nodes. + + +Allocating matrices for forward and/or backward propagation. starting epoch 0 at record count 0, and file position 0 already there from last epoch -randomordering: 11 retries for 100 elements (11.0%) to ensure window condition -randomordering: recached sequence for seed 0: 15, 33, ... 
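In the "Final Results" lines that follow, Perplexity is exp of the per-sample cross entropy: exp(0.30440022) = 1.3558116 in the new baseline and exp(0.90504265) = 2.4720373 in the old one. A one-line verification of that relationship:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        // prints 1.355812 2.472037, matching the logged Perplexity values
        std::printf("%.6f %.6f\n", std::exp(0.30440022), std::exp(0.90504265));
    }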
-Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.90504265 Perplexity = 2.4720373 +RandomOrdering: 11 retries for 100 elements (11.0%) to ensure window condition +RandomOrdering: recached sequence for seed 0: 15, 33, ... +Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.30440022 Perplexity = 1.3558116 COMPLETED From ef80d86dedac4b21b58970bfe05c343f8398028f Mon Sep 17 00:00:00 2001 From: Alexey Kamenev Date: Fri, 18 Dec 2015 23:41:59 +0000 Subject: [PATCH 19/19] Updated Linux baselines. --- .../QuickE2E/baseline.linux.debug.gpu.txt | 2679 +++++------------ .../QuickE2E/baseline.linux.release.gpu.txt | 2566 +++++----------- 2 files changed, 1589 insertions(+), 3656 deletions(-) diff --git a/Tests/EndToEndTests/Image/QuickE2E/baseline.linux.debug.gpu.txt b/Tests/EndToEndTests/Image/QuickE2E/baseline.linux.debug.gpu.txt index ced418fd8..3c5b6f82d 100644 --- a/Tests/EndToEndTests/Image/QuickE2E/baseline.linux.debug.gpu.txt +++ b/Tests/EndToEndTests/Image/QuickE2E/baseline.linux.debug.gpu.txt @@ -1,244 +1,247 @@ -running on localhost at 2015/11/23 11:42:03 +------------------------------------------------------------------- +Build info: + + Built time: Dec 18 2015 23:32:02 + Last modified date: Fri Dec 18 23:24:08 2015 + Build type: release + Math lib: acml + CUDA_PATH: /usr/local/cuda-7.0 + CUB_PATH: /usr/local/cub-1.4.1 + Build Branch: master + Build SHA1: f675c24ad6e803523212d772c27ae2c2c98b6ce9 +------------------------------------------------------------------- +running on localhost at 2015/12/18 23:38:54 command line: -/home/alexey/Projects/cntk/bin/cntk configFile=./QuickE2E/cntk.config DataDir=./Data RunDir=. 
ConfigDir=./QuickE2E DeviceId=0 +../../../bin/cntk configFile=QuickE2E/cntk.config ConfigDir=QuickE2E RunDir=_out DataDir=Data DeviceId=Auto stderr=gpu.txt >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=$DeviceId$ -ndlMacros=$ConfigDir$/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=$RunDir$/models/cntk.dnn - deviceId=$DeviceId$ - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = $DeviceId$ +ndlMacros = "$ConfigDir$/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "$RunDir$/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=$RunDir$/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl +test = [ + action = "test" + modelPath = "$RunDir$/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -DataDir=./Data -RunDir=. 
-ConfigDir=./QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=0 -ndlMacros=./QuickE2E/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=./models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = Auto +ndlMacros = "QuickE2E/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=./models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl +test = [ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -DataDir=./Data -RunDir=. -ConfigDir=./QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -configparameters: cntk.config:command=Train:Test -configparameters: cntk.config:ConfigDir=./QuickE2E -configparameters: cntk.config:DataDir=./Data -configparameters: cntk.config:deviceId=0 -configparameters: cntk.config:ndlMacros=./QuickE2E/Macros.ndl +configparameters: cntk.config:command=train:test +configparameters: cntk.config:ConfigDir=QuickE2E +configparameters: cntk.config:DataDir=Data +configparameters: cntk.config:deviceId=Auto +configparameters: cntk.config:ndlMacros=QuickE2E/Macros.ndl +configparameters: cntk.config:numCPUThreads=8 configparameters: cntk.config:parallelTrain=false configparameters: cntk.config:precision=float -configparameters: cntk.config:RunDir=. 
-configparameters: cntk.config:Test=[
- action=test
- modelPath=./models/cntk.dnn
- NDLNetworkBuilder=[
- networkDescription=./QuickE2E/Convolution.ndl
+configparameters: cntk.config:RunDir=_out
+configparameters: cntk.config:stderr=gpu.txt
+configparameters: cntk.config:test=[
+ action = "test"
+ modelPath = "_out/models/cntk.dnn"
+ NDLNetworkBuilder = [
+ networkDescription = "QuickE2E/Convolution.ndl"
 ]
- reader=[
- readerType=UCIFastReader
- file=./Data/Test.txt
- features=[
- dim=784
- start=1
+ reader = [
+ readerType = "UCIFastReader"
+ file = "Data/Test.txt"
+ features = [
+ dim = 784
+ start = 1
 ]
- labels=[
- dim=1
- start=0
- labelDim=10
- labelMappingFile=./Data/labelsmap.txt
+ labels = [
+ dim = 1
+ start = 0
+ labelDim = 10
+ labelMappingFile = "Data/labelsmap.txt"
 ]
 ]
]
-configparameters: cntk.config:Train=[
- action=train
- modelPath=./models/cntk.dnn
- deviceId=0
- traceLevel=1
- NDLNetworkBuilder=[
- networkDescription=./QuickE2E/Convolution.ndl
- ]
- SGD=[
- epochSize=100
- minibatchSize=10
- learningRatesPerMB=0.05
- momentumPerMB=0*10:0.7
- maxEpochs=12
+configparameters: cntk.config:train=[
+ action = "train"
+ modelPath = "_out/models/cntk.dnn"
+ traceLevel = 1
+ NDLNetworkBuilder = [
+ networkDescription = "QuickE2E/Convolution.ndl"
 ]
- reader=[
- readerType=UCIFastReader
- file=./Data/Train.txt
- features=[
- dim=784
- start=1
+ SGD = [
+ epochSize = 100
+ minibatchSize = 10
+ learningRatesPerMB = 0.05
+ momentumPerMB = 0*10:0.7
+ maxEpochs = 12
+ ]
+ reader = [
+ readerType = "UCIFastReader"
+ file = "Data/Train.txt"
+ features = [
+ dim = 784
+ start = 1
 ]
- labels=[
- dim=1
- start=0
- labelDim=10
- labelMappingFile=./Data/labelsmap.txt
+ labels = [
+ dim = 1
+ start = 0
+ labelDim = 10
+ labelMappingFile = "Data/labelsmap.txt"
 ]
 ]
]
<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
-command: Train Test
+command: train test
precision = float
-CNTKModelPath: ./models/cntk.dnn
-CNTKCommandTrainInfo: Train : 12
+Using 8 CPU threads
+CNTKModelPath: _out/models/cntk.dnn
+CNTKCommandTrainInfo: train : 12
CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 12
-CNTKCommandTrainBegin: Train
+CNTKCommandTrainBegin: train
+LockDevice: Locked GPU 0 to test availability.
+LockDevice: Unlocked GPU 0 after testing.
+LockDevice: Locked GPU 1 to test availability.
+LockDevice: Unlocked GPU 1 after testing.
+LockDevice: Locked GPU 2 to test availability.
+LockDevice: Unlocked GPU 2 after testing.
+LockDevice: Locked GPU 3 to test availability.
+LockDevice: Unlocked GPU 3 after testing.
+LockDevice: Locked GPU 0 for exclusive use.
NDLBuilder Using GPU 0
-reading uci file ./Data/Train.txt
+Reading UCI file Data/Train.txt
+SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4
+
+Post-processing network...
+
+3 roots:
+ outputNodes.z = Plus
+ ce = CrossEntropyWithSoftmax
+ err = ErrorPrediction
+FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation
+FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation
+FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation
-Allocating matrices for forward propagation.
+Validating for node outputNodes.z. 24 nodes to process in pass 1.
-
-Printing Gradient Computation Node Order ...
-
-CE[0, 0] = CrossEntropyWithSoftmax(labels[10, 1], OutputNodes.z[0, 0])
-OutputNodes.z[0, 0] = Plus(OutputNodes.t[0, 0], OutputNodes.b[10, 1])
-OutputNodes.b[10, 1] = LearnableParameter
-OutputNodes.t[0, 0] = Times(OutputNodes.W[10, 128], h1.y[0, 0])
-h1.y[0, 0] = Sigmoid(h1.z[0, 0])
-h1.z[0, 0] = Plus(h1.t[0, 0], h1.b[128, 1])
-h1.b[128, 1] = LearnableParameter
-h1.t[0, 0] = Times(h1.W[128, 512], pool2[0, 0])
-pool2[0, 0] = AveragePooling(conv2_act.act[0, 0])
-conv2_act.act[0, 0] = RectifiedLinear(conv2_act.convPlusB[0, 0])
-conv2_act.convPlusB[0, 0] = Plus(conv2_act.conv[0, 0], conv2_act.convB[32, 1])
-conv2_act.convB[32, 1] = LearnableParameter
-conv2_act.conv[0, 0] = Convolution(conv2_act.convW[32, 400], pool1[0, 0])
-pool1[0, 0] = MaxPooling(conv1_act.act[0, 0])
-conv1_act.act[0, 0] = RectifiedLinear(conv1_act.convPlusB[0, 0])
-conv1_act.convPlusB[0, 0] = Plus(conv1_act.conv[0, 0], conv1_act.convB[16, 1])
-conv1_act.convB[16, 1] = LearnableParameter
-conv1_act.conv[0, 0] = Convolution(conv1_act.convW[16, 25], featScaled[0, 0])
-featScaled[0, 0] = Scale(featScale[1, 1], features[784, 1])
-features[784, 1] = InputValue
-featScale[1, 1] = LearnableParameter
-conv1_act.convW[16, 25] = LearnableParameter
-conv2_act.convW[32, 400] = LearnableParameter
-h1.W[128, 512] = LearnableParameter
-OutputNodes.W[10, 128] = LearnableParameter
-labels[10, 1] = InputValue
-
-Validating for node CE. 26 nodes to process in pass 1.
-
-Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
Validating --> h1.W = LearnableParameter -> [128, 512]
Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -247,27 +250,25 @@ Validating --> features = InputValue -> [784, MBSize 1]
Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating for node CE. 15 nodes to process in pass 2.
+Validating for node outputNodes.z. 14 nodes to process in pass 2.
-Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
Validating --> h1.W = LearnableParameter -> [128, 512]
Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -276,27 +277,25 @@ Validating --> features = InputValue -> [784, MBSize 1]
Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating for node CE, final verification.
+Validating for node outputNodes.z, final verification.
-Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
Validating --> h1.W = LearnableParameter -> [128, 512]
Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -305,292 +304,29 @@ Validating --> features = InputValue -> [784, MBSize 1]
Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
-
-10 out of 26 nodes do not share the minibatch layout with the input data.
-
-
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]

9 out of 24 nodes do not share the minibatch layout with the input data.

-Validating for node Err. 26 nodes to process in pass 1.
+Validating for node ce. 26 nodes to process in pass 1.
Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
Validating --> h1.W = LearnableParameter -> [128, 512]
Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -599,27 +335,27 @@ Validating --> features = InputValue -> [784, MBSize 1]
Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1]
-Validating for node Err. 14 nodes to process in pass 2.
+Validating for node ce. 14 nodes to process in pass 2.
Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
Validating --> h1.W = LearnableParameter -> [128, 512]
Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -628,27 +364,27 @@ Validating --> features = InputValue -> [784, MBSize 1]
Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1]
-Validating for node Err, final verification.
+Validating for node ce, final verification.
Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
Validating --> h1.W = LearnableParameter -> [128, 512]
Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -657,31 +393,30 @@ Validating --> features = InputValue -> [784, MBSize 1]
Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1]

10 out of 26 nodes do not share the minibatch layout with the input data.

-
-Validating for node Err. 26 nodes to process in pass 1.
+Validating for node err. 26 nodes to process in pass 1.
Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
Validating --> h1.W = LearnableParameter -> [128, 512]
Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -690,27 +425,27 @@ Validating --> features = InputValue -> [784, MBSize 1]
Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1]
-Validating for node Err. 14 nodes to process in pass 2.
+Validating for node err. 14 nodes to process in pass 2.
Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
Validating --> h1.W = LearnableParameter -> [128, 512]
Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -719,27 +454,27 @@ Validating --> features = InputValue -> [784, MBSize 1]
Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1]
-Validating for node Err, final verification.
+Validating for node err, final verification.
Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
Validating --> h1.W = LearnableParameter -> [128, 512]
Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -748,35 +483,40 @@ Validating --> features = InputValue -> [784, MBSize 1]
Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. -SetUniformRandomValue (GPU): creating curand object with seed 1 +Post-processing network complete. + SGD using GPU 0. -GetTrainCriterionNodes ... -GetEvalCriterionNodes ... + +Training criterion node(s): + ce = CrossEntropyWithSoftmax + +Evaluation criterion node(s): + err = ErrorPrediction -Allocating matrices for gradient computing +Allocating matrices for forward and/or backward propagation. No PreCompute nodes found, skipping PreCompute step Set Max Temp Mem Size For Convolution Nodes to 0 samples. -Starting Epoch 1: learning rate per sample = 0.005000 effective momentum = 0.000000 +Starting Epoch 1: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting at epoch 0 counting lines to determine record count 1000 records found @@ -784,148 +524,126 @@ starting epoch 0 at record count 0, and file position 0 already there from last epoch Starting minibatch loop. -randomordering: 21 retries for 100 elements (21.0%) to ensure window condition -randomordering: recached sequence for seed 0: 38, 46, ... - Epoch[ 1 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.41911163; EvalErr[0]PerSample = 0.92000000; TotalTime = 0.53526s; TotalTimePerSample = 5.35259ms; SamplesPerSecond = 186 -Finished Epoch[ 1 of 12]: [Training Set] TrainLossPerSample = 2.4191115; EvalErrPerSample = 0.91999996; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.558449 -Starting Epoch 2: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition +RandomOrdering: recached sequence for seed 0: 38, 46, ... + Epoch[ 1 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.39150986; EvalErr[0]PerSample = 0.94000000; TotalTime = 0.1702s; SamplesPerSecond = 587.5 +Finished Epoch[ 1 of 12]: [Training Set] TrainLossPerSample = 2.3915098; EvalErrPerSample = 0.94; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.191305 +Starting Epoch 2: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 1 at record count 100, and file position 100 already there from last epoch Starting minibatch loop. -randomordering: 21 retries for 100 elements (21.0%) to ensure window condition -randomordering: recached sequence for seed 1: 38, 46, ... - Epoch[ 2 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.38765198; EvalErr[0]PerSample = 0.89000000; TotalTime = 0.06055s; TotalTimePerSample = 0.60545ms; SamplesPerSecond = 1651 -Finished Epoch[ 2 of 12]: [Training Set] TrainLossPerSample = 2.3876519; EvalErrPerSample = 0.88999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.060761 -Starting Epoch 3: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition +RandomOrdering: recached sequence for seed 1: 38, 46, ... 
+ Epoch[ 2 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.29544342; EvalErr[0]PerSample = 0.87000000; TotalTime = 0.0532s; SamplesPerSecond = 1878.2 +Finished Epoch[ 2 of 12]: [Training Set] TrainLossPerSample = 2.2954433; EvalErrPerSample = 0.87; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.053534 +Starting Epoch 3: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 2 at record count 200, and file position 200 already there from last epoch Starting minibatch loop. -randomordering: 30 retries for 100 elements (30.0%) to ensure window condition -randomordering: recached sequence for seed 2: 34, 6, ... - Epoch[ 3 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.30177277; EvalErr[0]PerSample = 0.85000000; TotalTime = 0.06050s; TotalTimePerSample = 0.60495ms; SamplesPerSecond = 1653 -Finished Epoch[ 3 of 12]: [Training Set] TrainLossPerSample = 2.3017728; EvalErrPerSample = 0.84999996; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.060688 -Starting Epoch 4: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 30 retries for 100 elements (30.0%) to ensure window condition +RandomOrdering: recached sequence for seed 2: 34, 6, ... + Epoch[ 3 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.11703644; EvalErr[0]PerSample = 0.69000000; TotalTime = 0.0535s; SamplesPerSecond = 1870.5 +Finished Epoch[ 3 of 12]: [Training Set] TrainLossPerSample = 2.1170363; EvalErrPerSample = 0.69; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.053674 +Starting Epoch 4: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 3 at record count 300, and file position 300 already there from last epoch Starting minibatch loop. -randomordering: 14 retries for 100 elements (14.0%) to ensure window condition -randomordering: recached sequence for seed 3: 35, 34, ... - Epoch[ 4 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.33002518; EvalErr[0]PerSample = 0.89000000; TotalTime = 0.05966s; TotalTimePerSample = 0.59664ms; SamplesPerSecond = 1676 -Finished Epoch[ 4 of 12]: [Training Set] TrainLossPerSample = 2.3300252; EvalErrPerSample = 0.88999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.059867 -Starting Epoch 5: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 14 retries for 100 elements (14.0%) to ensure window condition +RandomOrdering: recached sequence for seed 3: 35, 34, ... + Epoch[ 4 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.99407211; EvalErr[0]PerSample = 0.65000000; TotalTime = 0.0541s; SamplesPerSecond = 1847.6 +Finished Epoch[ 4 of 12]: [Training Set] TrainLossPerSample = 1.9940721; EvalErrPerSample = 0.64999998; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.05433 +Starting Epoch 5: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 4 at record count 400, and file position 400 already there from last epoch Starting minibatch loop. -randomordering: 13 retries for 100 elements (13.0%) to ensure window condition -randomordering: recached sequence for seed 4: 30, 23, ... 
- Epoch[ 5 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.23725708; EvalErr[0]PerSample = 0.88000000; TotalTime = 0.05858s; TotalTimePerSample = 0.58577ms; SamplesPerSecond = 1707 -Finished Epoch[ 5 of 12]: [Training Set] TrainLossPerSample = 2.237257; EvalErrPerSample = 0.88; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.058768 -Starting Epoch 6: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 13 retries for 100 elements (13.0%) to ensure window condition +RandomOrdering: recached sequence for seed 4: 30, 23, ... + Epoch[ 5 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.72756714; EvalErr[0]PerSample = 0.45000000; TotalTime = 0.0555s; SamplesPerSecond = 1801.4 +Finished Epoch[ 5 of 12]: [Training Set] TrainLossPerSample = 1.7275671; EvalErrPerSample = 0.44999999; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.055725 +Starting Epoch 6: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 5 at record count 500, and file position 500 already there from last epoch Starting minibatch loop. -randomordering: 25 retries for 100 elements (25.0%) to ensure window condition -randomordering: recached sequence for seed 5: 33, 43, ... - Epoch[ 6 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.24089386; EvalErr[0]PerSample = 0.90000000; TotalTime = 0.05882s; TotalTimePerSample = 0.58824ms; SamplesPerSecond = 1699 -Finished Epoch[ 6 of 12]: [Training Set] TrainLossPerSample = 2.2408938; EvalErrPerSample = 0.89999998; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.059015 -Starting Epoch 7: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 25 retries for 100 elements (25.0%) to ensure window condition +RandomOrdering: recached sequence for seed 5: 33, 43, ... + Epoch[ 6 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.51963303; EvalErr[0]PerSample = 0.21000000; TotalTime = 0.0539s; SamplesPerSecond = 1854.3 +Finished Epoch[ 6 of 12]: [Training Set] TrainLossPerSample = 1.5196329; EvalErrPerSample = 0.20999999; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.054135 +Starting Epoch 7: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 6 at record count 600, and file position 600 already there from last epoch Starting minibatch loop. -randomordering: 14 retries for 100 elements (14.0%) to ensure window condition -randomordering: recached sequence for seed 6: 12, 17, ... - Epoch[ 7 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.15189026; EvalErr[0]PerSample = 0.80000000; TotalTime = 0.05783s; TotalTimePerSample = 0.57827ms; SamplesPerSecond = 1729 -Finished Epoch[ 7 of 12]: [Training Set] TrainLossPerSample = 2.1518903; EvalErrPerSample = 0.79999995; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.058039 -Starting Epoch 8: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 14 retries for 100 elements (14.0%) to ensure window condition +RandomOrdering: recached sequence for seed 6: 12, 17, ... 
+ Epoch[ 7 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.29057953; EvalErr[0]PerSample = 0.20000000; TotalTime = 0.0548s; SamplesPerSecond = 1823.9 +Finished Epoch[ 7 of 12]: [Training Set] TrainLossPerSample = 1.2905796; EvalErrPerSample = 0.19999999; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.055041 +Starting Epoch 8: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 7 at record count 700, and file position 700 already there from last epoch Starting minibatch loop. -randomordering: 14 retries for 100 elements (14.0%) to ensure window condition -randomordering: recached sequence for seed 7: 40, 7, ... - Epoch[ 8 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.02036377; EvalErr[0]PerSample = 0.68000000; TotalTime = 0.05703s; TotalTimePerSample = 0.57030ms; SamplesPerSecond = 1753 -Finished Epoch[ 8 of 12]: [Training Set] TrainLossPerSample = 2.0203638; EvalErrPerSample = 0.68000001; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.057228 -Starting Epoch 9: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 14 retries for 100 elements (14.0%) to ensure window condition +RandomOrdering: recached sequence for seed 7: 40, 7, ... + Epoch[ 8 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.97354042; EvalErr[0]PerSample = 0.05000000; TotalTime = 0.0543s; SamplesPerSecond = 1841.6 +Finished Epoch[ 8 of 12]: [Training Set] TrainLossPerSample = 0.97354043; EvalErrPerSample = 0.049999997; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.054515 +Starting Epoch 9: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 8 at record count 800, and file position 800 already there from last epoch Starting minibatch loop. -randomordering: 17 retries for 100 elements (17.0%) to ensure window condition -randomordering: recached sequence for seed 8: 8, 48, ... - Epoch[ 9 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.74879242; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.05806s; TotalTimePerSample = 0.58065ms; SamplesPerSecond = 1722 -Finished Epoch[ 9 of 12]: [Training Set] TrainLossPerSample = 1.7487924; EvalErrPerSample = 0.44; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.058275 -Starting Epoch 10: learning rate per sample = 0.005000 effective momentum = 0.000000 +RandomOrdering: 17 retries for 100 elements (17.0%) to ensure window condition +RandomOrdering: recached sequence for seed 8: 8, 48, ... + Epoch[ 9 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.73900383; EvalErr[0]PerSample = 0.03000000; TotalTime = 0.0544s; SamplesPerSecond = 1837.7 +Finished Epoch[ 9 of 12]: [Training Set] TrainLossPerSample = 0.73900384; EvalErrPerSample = 0.029999999; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.054655 +Starting Epoch 10: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples starting epoch 9 at record count 900, and file position 900 already there from last epoch Starting minibatch loop. -randomordering: 21 retries for 100 elements (21.0%) to ensure window condition -randomordering: recached sequence for seed 9: 14, 26, ... 
- Epoch[10 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.56006454; EvalErr[0]PerSample = 0.18000000; TotalTime = 0.05721s; TotalTimePerSample = 0.57207ms; SamplesPerSecond = 1748 -Finished Epoch[10 of 12]: [Training Set] TrainLossPerSample = 1.5600646; EvalErrPerSample = 0.17999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.057409 -Starting Epoch 11: learning rate per sample = 0.005000 effective momentum = 0.700000 +RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition +RandomOrdering: recached sequence for seed 9: 14, 26, ... + Epoch[10 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.57405857; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0542s; SamplesPerSecond = 1846.4 +Finished Epoch[10 of 12]: [Training Set] TrainLossPerSample = 0.57405853; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.054379 +Starting Epoch 11: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples starting epoch 10 at record count 1000, and file position 0 already there from last epoch Starting minibatch loop. -randomordering: 31 retries for 100 elements (31.0%) to ensure window condition -randomordering: recached sequence for seed 10: 22, 4, ... - Epoch[11 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.32553162; EvalErr[0]PerSample = 0.14000000; TotalTime = 0.05778s; TotalTimePerSample = 0.57785ms; SamplesPerSecond = 1730 -Finished Epoch[11 of 12]: [Training Set] TrainLossPerSample = 1.3255316; EvalErrPerSample = 0.14; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.058009 -Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 +RandomOrdering: 31 retries for 100 elements (31.0%) to ensure window condition +RandomOrdering: recached sequence for seed 10: 22, 4, ... + Epoch[11 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.45112953; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0591s; SamplesPerSecond = 1690.7 +Finished Epoch[11 of 12]: [Training Set] TrainLossPerSample = 0.45112953; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.05945 +Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples starting epoch 11 at record count 1100, and file position 100 already there from last epoch Starting minibatch loop. -randomordering: 17 retries for 100 elements (17.0%) to ensure window condition -randomordering: recached sequence for seed 11: 2, 40, ... - Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.01003433; EvalErr[0]PerSample = 0.03000000; TotalTime = 0.05721s; TotalTimePerSample = 0.57209ms; SamplesPerSecond = 1747 -Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 1.0100343; EvalErrPerSample = 0.029999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.057407 -CNTKCommandTrainEnd: Train +RandomOrdering: 17 retries for 100 elements (17.0%) to ensure window condition +RandomOrdering: recached sequence for seed 11: 2, 40, ... 
+ Epoch[12 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.34545708; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0540s; SamplesPerSecond = 1851.0 +Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.34545708; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.054291 +CNTKCommandTrainEnd: train + +Post-processing network... + +3 roots: + ce = CrossEntropyWithSoftmax + err = ErrorPrediction + outputNodes.z = Plus +FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation +FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation +FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation -Allocating matrices for forward propagation. - - -Printing Gradient Computation Node Order ... - -CE[0, 0] = CrossEntropyWithSoftmax(labels[10, 0], OutputNodes.z[0, 0]) -OutputNodes.z[0, 0] = Plus(OutputNodes.t[0, 0], OutputNodes.b[10, 1]) -OutputNodes.b[10, 1] = LearnableParameter -OutputNodes.t[0, 0] = Times(OutputNodes.W[10, 128], h1.y[0, 0]) -h1.y[0, 0] = Sigmoid(h1.z[0, 0]) -h1.z[0, 0] = Plus(h1.t[0, 0], h1.b[128, 1]) -h1.b[128, 1] = LearnableParameter -h1.t[0, 0] = Times(h1.W[128, 512], pool2[0, 0]) -pool2[0, 0] = AveragePooling(conv2_act.act[0, 0]) -conv2_act.act[0, 0] = RectifiedLinear(conv2_act.convPlusB[0, 0]) -conv2_act.convPlusB[0, 0] = Plus(conv2_act.conv[0, 0], conv2_act.convB[32, 1]) -conv2_act.convB[32, 1] = LearnableParameter -conv2_act.conv[0, 0] = Convolution(conv2_act.convW[32, 400], pool1[0, 0]) -pool1[0, 0] = MaxPooling(conv1_act.act[0, 0]) -conv1_act.act[0, 0] = RectifiedLinear(conv1_act.convPlusB[0, 0]) -conv1_act.convPlusB[0, 0] = Plus(conv1_act.conv[0, 0], conv1_act.convB[16, 1]) -conv1_act.convB[16, 1] = LearnableParameter -conv1_act.conv[0, 0] = Convolution(conv1_act.convW[16, 25], featScaled[0, 0]) -featScaled[0, 0] = Scale(featScale[1, 1], features[784, 0]) -features[784, 0] = InputValue -featScale[1, 1] = LearnableParameter -conv1_act.convW[16, 25] = LearnableParameter -conv2_act.convW[32, 400] = LearnableParameter -h1.W[128, 512] = LearnableParameter -OutputNodes.W[10, 128] = LearnableParameter -labels[10, 0] = InputValue - -Validating for node CE. 26 nodes to process in pass 1. +Validating for node ce. 26 nodes to process in pass 1. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -946,15 +664,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE. 15 nodes to process in pass 2. +Validating for node ce. 15 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -975,15 +693,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE, final verification. +Validating for node ce, final verification. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1004,19 +722,18 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. - -Validating for node CE. 26 nodes to process in pass 1. +Validating for node err. 26 nodes to process in pass 1. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1037,15 +754,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE. 14 nodes to process in pass 2. +Validating for node err. 14 nodes to process in pass 2. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1066,15 +783,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE, final verification. +Validating for node err, final verification. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1095,18 +812,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Validating for node outputNodes.z. 24 nodes to process in pass 1. -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1127,13 +843,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node OutputNodes.z. 13 nodes to process in pass 2. +Validating for node outputNodes.z. 13 nodes to process in pass 2. -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1154,13 +870,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node OutputNodes.z, final verification. +Validating for node outputNodes.z, final verification. 
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1181,530 +897,271 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] 9 out of 24 nodes do not share the minibatch layout with the input data. - - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. - -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 
13 nodes to process in pass 2. - -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -9 out of 24 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err. 14 nodes to process in pass 2. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err. 14 nodes to process in pass 2. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - +Post-processing network complete. evalNodeNames are not specified, using all the default evalnodes and training criterion nodes. + + +Allocating matrices for forward and/or backward propagation. starting epoch 0 at record count 0, and file position 0 already there from last epoch -randomordering: 21 retries for 100 elements (21.0%) to ensure window condition -randomordering: recached sequence for seed 0: 38, 46, ... -Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.84035759 Perplexity = 2.3171954 +RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition +RandomOrdering: recached sequence for seed 0: 38, 46, ... 
+Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.30271576 Perplexity = 1.3535297 COMPLETED === Deleting last epoch data ==== Re-running from checkpoint -running on localhost at 2015/11/23 11:43:57 + +------------------------------------------------------------------- +Build info: + + Built time: Dec 18 2015 23:32:02 + Last modified date: Fri Dec 18 23:24:08 2015 + Build type: release + Math lib: acml + CUDA_PATH: /usr/local/cuda-7.0 + CUB_PATH: /usr/local/cub-1.4.1 + Build Branch: master + Build SHA1: f675c24ad6e803523212d772c27ae2c2c98b6ce9 +------------------------------------------------------------------- +running on localhost at 2015/12/18 23:41:15 command line: -/home/alexey/Projects/cntk/bin/cntk configFile=./QuickE2E/cntk.config DataDir=./Data RunDir=. ConfigDir=./QuickE2E DeviceId=0 +../../../bin/cntk configFile=QuickE2E/cntk.config ConfigDir=QuickE2E RunDir=_out DataDir=Data DeviceId=Auto stderr=gpu.txt >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=$DeviceId$ -ndlMacros=$ConfigDir$/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=$RunDir$/models/cntk.dnn - deviceId=$DeviceId$ - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = $DeviceId$ +ndlMacros = "$ConfigDir$/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "$RunDir$/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=$RunDir$/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl +test = [ + action = "test" + modelPath = "$RunDir$/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -DataDir=./Data -RunDir=. 
-ConfigDir=./QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=0 -ndlMacros=./QuickE2E/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=./models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = Auto +ndlMacros = "QuickE2E/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=./models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl +test = [ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -DataDir=./Data -RunDir=. -ConfigDir=./QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -configparameters: cntk.config:command=Train:Test -configparameters: cntk.config:ConfigDir=./QuickE2E -configparameters: cntk.config:DataDir=./Data -configparameters: cntk.config:deviceId=0 -configparameters: cntk.config:ndlMacros=./QuickE2E/Macros.ndl +configparameters: cntk.config:command=train:test +configparameters: cntk.config:ConfigDir=QuickE2E +configparameters: cntk.config:DataDir=Data +configparameters: cntk.config:deviceId=Auto +configparameters: cntk.config:ndlMacros=QuickE2E/Macros.ndl +configparameters: cntk.config:numCPUThreads=8 configparameters: cntk.config:parallelTrain=false configparameters: cntk.config:precision=float -configparameters: cntk.config:RunDir=. 
-configparameters: cntk.config:Test=[ - action=test - modelPath=./models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl +configparameters: cntk.config:RunDir=_out +configparameters: cntk.config:stderr=gpu.txt +configparameters: cntk.config:test=[ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -configparameters: cntk.config:Train=[ - action=train - modelPath=./models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +configparameters: cntk.config:train=[ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< -command: Train Test +command: train test precision = float -CNTKModelPath: ./models/cntk.dnn -CNTKCommandTrainInfo: Train : 12 +Using 8 CPU threads +CNTKModelPath: _out/models/cntk.dnn +CNTKCommandTrainInfo: train : 12 CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 12 -CNTKCommandTrainBegin: Train +CNTKCommandTrainBegin: train +LockDevice: Locked GPU 0 to test availability. +LockDevice: Unlocked GPU 0 after testing. +LockDevice: Locked GPU 1 to test availability. +LockDevice: Unlocked GPU 1 after testing. +LockDevice: Locked GPU 2 to test availability. +LockDevice: Unlocked GPU 2 after testing. +LockDevice: Locked GPU 3 to test availability. +LockDevice: Unlocked GPU 3 after testing. +LockDevice: Locked GPU 0 for exclusive use. NDLBuilder Using GPU 0 -reading uci file ./Data/Train.txt -Starting from checkpoint. Load Network From File ./models/cntk.dnn.11. +Reading UCI file Data/Train.txt +Starting from checkpoint. Load Network From File _out/models/cntk.dnn.11. + +Post-processing network... + +3 roots: + err = ErrorPrediction + outputNodes.z = Plus + ce = CrossEntropyWithSoftmax +FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation +FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation +FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation -Allocating matrices for forward propagation. - - -Printing Gradient Computation Node Order ... 
- -CE[0, 0] = CrossEntropyWithSoftmax(labels[10, 0], OutputNodes.z[0, 0]) -OutputNodes.z[0, 0] = Plus(OutputNodes.t[0, 0], OutputNodes.b[10, 1]) -OutputNodes.b[10, 1] = LearnableParameter -OutputNodes.t[0, 0] = Times(OutputNodes.W[10, 128], h1.y[0, 0]) -h1.y[0, 0] = Sigmoid(h1.z[0, 0]) -h1.z[0, 0] = Plus(h1.t[0, 0], h1.b[128, 1]) -h1.b[128, 1] = LearnableParameter -h1.t[0, 0] = Times(h1.W[128, 512], pool2[0, 0]) -pool2[0, 0] = AveragePooling(conv2_act.act[0, 0]) -conv2_act.act[0, 0] = RectifiedLinear(conv2_act.convPlusB[0, 0]) -conv2_act.convPlusB[0, 0] = Plus(conv2_act.conv[0, 0], conv2_act.convB[32, 1]) -conv2_act.convB[32, 1] = LearnableParameter -conv2_act.conv[0, 0] = Convolution(conv2_act.convW[32, 400], pool1[0, 0]) -pool1[0, 0] = MaxPooling(conv1_act.act[0, 0]) -conv1_act.act[0, 0] = RectifiedLinear(conv1_act.convPlusB[0, 0]) -conv1_act.convPlusB[0, 0] = Plus(conv1_act.conv[0, 0], conv1_act.convB[16, 1]) -conv1_act.convB[16, 1] = LearnableParameter -conv1_act.conv[0, 0] = Convolution(conv1_act.convW[16, 25], featScaled[0, 0]) -featScaled[0, 0] = Scale(featScale[1, 1], features[784, 0]) -features[784, 0] = InputValue -featScale[1, 1] = LearnableParameter -conv1_act.convW[16, 25] = LearnableParameter -conv2_act.convW[32, 400] = LearnableParameter -h1.W[128, 512] = LearnableParameter -OutputNodes.W[10, 128] = LearnableParameter -labels[10, 0] = InputValue - -Validating for node CE. 26 nodes to process in pass 1. +Validating for node err. 26 nodes to process in pass 1. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1725,15 +1182,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE. 15 nodes to process in pass 2. +Validating for node err. 15 nodes to process in pass 2. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1754,15 +1211,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE, final verification. +Validating for node err, final verification. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1783,19 +1240,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Validating for node outputNodes.z. 24 nodes to process in pass 1. -Validating for node CE. 26 nodes to process in pass 1. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1816,15 +1271,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE. 14 nodes to process in pass 2. +Validating for node outputNodes.z. 13 nodes to process in pass 2. -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1845,15 +1298,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE, final verification. +Validating for node outputNodes.z, final verification. 
-Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1874,189 +1325,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. - -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] 9 out of 24 nodes do not share the minibatch layout with the input data. - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -9 out of 24 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node ce. 26 nodes to process in pass 1. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2077,15 +1356,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node ce. 14 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2106,15 +1385,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node ce, final verification. 
Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
Validating --> h1.W = LearnableParameter -> [128, 512]
Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2135,114 +1414,29 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
Validating --> h1.b = LearnableParameter -> [128, 1]
Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

10 out of 26 nodes do not share the minibatch layout with the input data.

-
-
-Validating for node Err. 26 nodes to process in pass 1.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-Validating for node Err. 14 nodes to process in pass 2.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-Validating for node Err, final verification.
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. +Post-processing network complete. SGD using GPU 0. -GetTrainCriterionNodes ... -GetEvalCriterionNodes ... + +Training criterion node(s): + ce = CrossEntropyWithSoftmax + +Evaluation criterion node(s): + err = ErrorPrediction -Allocating matrices for gradient computing +Allocating matrices for forward and/or backward propagation. No PreCompute nodes found, skipping PreCompute step Warning: checkpoint file is missing. learning parameters will be initialized from 0 Set Max Temp Mem Size For Convolution Nodes to 0 samples. -Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 +Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples starting at epoch 11 counting lines to determine record count 1000 records found @@ -2250,49 +1444,26 @@ starting epoch 11 at record count 1100, and file position 100 reading from record 0 to 100 to be positioned properly for epoch Starting minibatch loop. -randomordering: 17 retries for 100 elements (17.0%) to ensure window condition -randomordering: recached sequence for seed 11: 2, 40, ... 
- Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.01463303; EvalErr[0]PerSample = 0.02000000; TotalTime = 0.12786s; TotalTimePerSample = 1.27864ms; SamplesPerSecond = 782 -Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 1.0146331; EvalErrPerSample = 0.02; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.152772 -CNTKCommandTrainEnd: Train +RandomOrdering: 17 retries for 100 elements (17.0%) to ensure window condition +RandomOrdering: recached sequence for seed 11: 2, 40, ... + Epoch[12 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.34671265; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.1653s; SamplesPerSecond = 604.8 +Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.34671265; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.188096 +CNTKCommandTrainEnd: train + +Post-processing network... + +3 roots: + outputNodes.z = Plus + err = ErrorPrediction + ce = CrossEntropyWithSoftmax +FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation +FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation +FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation -Allocating matrices for forward propagation. +Validating for node outputNodes.z. 24 nodes to process in pass 1. - -Printing Gradient Computation Node Order ... - -CE[0, 0] = CrossEntropyWithSoftmax(labels[10, 0], OutputNodes.z[0, 0]) -OutputNodes.z[0, 0] = Plus(OutputNodes.t[0, 0], OutputNodes.b[10, 1]) -OutputNodes.b[10, 1] = LearnableParameter -OutputNodes.t[0, 0] = Times(OutputNodes.W[10, 128], h1.y[0, 0]) -h1.y[0, 0] = Sigmoid(h1.z[0, 0]) -h1.z[0, 0] = Plus(h1.t[0, 0], h1.b[128, 1]) -h1.b[128, 1] = LearnableParameter -h1.t[0, 0] = Times(h1.W[128, 512], pool2[0, 0]) -pool2[0, 0] = AveragePooling(conv2_act.act[0, 0]) -conv2_act.act[0, 0] = RectifiedLinear(conv2_act.convPlusB[0, 0]) -conv2_act.convPlusB[0, 0] = Plus(conv2_act.conv[0, 0], conv2_act.convB[32, 1]) -conv2_act.convB[32, 1] = LearnableParameter -conv2_act.conv[0, 0] = Convolution(conv2_act.convW[32, 400], pool1[0, 0]) -pool1[0, 0] = MaxPooling(conv1_act.act[0, 0]) -conv1_act.act[0, 0] = RectifiedLinear(conv1_act.convPlusB[0, 0]) -conv1_act.convPlusB[0, 0] = Plus(conv1_act.conv[0, 0], conv1_act.convB[16, 1]) -conv1_act.convB[16, 1] = LearnableParameter -conv1_act.conv[0, 0] = Convolution(conv1_act.convW[16, 25], featScaled[0, 0]) -featScaled[0, 0] = Scale(featScale[1, 1], features[784, 0]) -features[784, 0] = InputValue -featScale[1, 1] = LearnableParameter -conv1_act.convW[16, 25] = LearnableParameter -conv2_act.convW[32, 400] = LearnableParameter -h1.W[128, 512] = LearnableParameter -OutputNodes.W[10, 128] = LearnableParameter -labels[10, 0] = InputValue - -Validating for node CE. 26 nodes to process in pass 1. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2313,15 +1484,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE. 15 nodes to process in pass 2. +Validating for node outputNodes.z. 14 nodes to process in pass 2. -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2342,15 +1511,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE, final verification. +Validating for node outputNodes.z, final verification. 
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
Validating --> h1.W = LearnableParameter -> [128, 512]
Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2371,280 +1538,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
Validating --> h1.b = LearnableParameter -> [128, 1]
Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-10 out of 26 nodes do not share the minibatch layout with the input data.
-
-
-
-Validating for node CE. 26 nodes to process in pass 1.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-Validating for node CE. 14 nodes to process in pass 2.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-Validating for node CE, final verification.
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] 9 out of 24 nodes do not share the minibatch layout with the input data. - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -9 out of 24 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node err. 26 nodes to process in pass 1. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2665,15 +1569,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node err. 14 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2694,15 +1598,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node err, final verification. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2723,19 +1627,18 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node ce. 26 nodes to process in pass 1. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2756,15 +1659,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node ce. 14 nodes to process in pass 2. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2785,15 +1688,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node ce, final verification. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -2814,17 +1717,21 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Post-processing network complete. evalNodeNames are not specified, using all the default evalnodes and training criterion nodes. + + +Allocating matrices for forward and/or backward propagation. starting epoch 0 at record count 0, and file position 0 already there from last epoch -randomordering: 21 retries for 100 elements (21.0%) to ensure window condition -randomordering: recached sequence for seed 0: 38, 46, ... 
-Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.88667282 Perplexity = 2.427041 +RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition +RandomOrdering: recached sequence for seed 0: 38, 46, ... +Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.31798401 Perplexity = 1.3743543 COMPLETED diff --git a/Tests/EndToEndTests/Image/QuickE2E/baseline.linux.release.gpu.txt b/Tests/EndToEndTests/Image/QuickE2E/baseline.linux.release.gpu.txt index bbba1a1b9..2dcc47bf6 100644 --- a/Tests/EndToEndTests/Image/QuickE2E/baseline.linux.release.gpu.txt +++ b/Tests/EndToEndTests/Image/QuickE2E/baseline.linux.release.gpu.txt @@ -1,215 +1,247 @@ -running on localhost at 2015/11/23 11:51:07 +------------------------------------------------------------------- +Build info: + + Built time: Dec 18 2015 23:17:10 + Last modified date: Fri Dec 18 23:16:43 2015 + Build type: release + Math lib: acml + CUDA_PATH: /usr/local/cuda-7.0 + CUB_PATH: /usr/local/cub-1.4.1 + Build Branch: master + Build SHA1: f675c24ad6e803523212d772c27ae2c2c98b6ce9 +------------------------------------------------------------------- +running on localhost at 2015/12/18 23:25:20 command line: -/home/alexey/Projects/cntk/bin/cntk configFile=./QuickE2E/cntk.config DataDir=./Data RunDir=. ConfigDir=./QuickE2E DeviceId=0 +../../../bin/cntk configFile=QuickE2E/cntk.config ConfigDir=QuickE2E RunDir=_out DataDir=Data DeviceId=Auto stderr=_outgpu.txt >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=$DeviceId$ -ndlMacros=$ConfigDir$/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=$RunDir$/models/cntk.dnn - deviceId=$DeviceId$ - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = $DeviceId$ +ndlMacros = "$ConfigDir$/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "$RunDir$/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=$RunDir$/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl +test = [ + action = "test" + modelPath = "$RunDir$/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = 
"$DataDir$/labelsmap.txt" ] ] ] -DataDir=./Data -RunDir=. -ConfigDir=./QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=_outgpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=0 -ndlMacros=./QuickE2E/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=./models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = Auto +ndlMacros = "QuickE2E/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=./models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl +test = [ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -DataDir=./Data -RunDir=. -ConfigDir=./QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=_outgpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -configparameters: cntk.config:command=Train:Test -configparameters: cntk.config:ConfigDir=./QuickE2E -configparameters: cntk.config:DataDir=./Data -configparameters: cntk.config:deviceId=0 -configparameters: cntk.config:ndlMacros=./QuickE2E/Macros.ndl +configparameters: cntk.config:command=train:test +configparameters: cntk.config:ConfigDir=QuickE2E +configparameters: cntk.config:DataDir=Data +configparameters: cntk.config:deviceId=Auto +configparameters: cntk.config:ndlMacros=QuickE2E/Macros.ndl +configparameters: cntk.config:numCPUThreads=8 configparameters: cntk.config:parallelTrain=false configparameters: cntk.config:precision=float -configparameters: cntk.config:RunDir=. 
-configparameters: cntk.config:Test=[ - action=test - modelPath=./models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl +configparameters: cntk.config:RunDir=_out +configparameters: cntk.config:stderr=_outgpu.txt +configparameters: cntk.config:test=[ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -configparameters: cntk.config:Train=[ - action=train - modelPath=./models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +configparameters: cntk.config:train=[ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< -command: Train Test +command: train test precision = float -CNTKModelPath: ./models/cntk.dnn -CNTKCommandTrainInfo: Train : 12 +Using 8 CPU threads +CNTKModelPath: _out/models/cntk.dnn +CNTKCommandTrainInfo: train : 12 CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 12 -CNTKCommandTrainBegin: Train +CNTKCommandTrainBegin: train +LockDevice: Locked GPU 0 to test availability. +LockDevice: Unlocked GPU 0 after testing. +LockDevice: Locked GPU 1 to test availability. +LockDevice: Unlocked GPU 1 after testing. +LockDevice: Locked GPU 2 to test availability. +LockDevice: Unlocked GPU 2 after testing. +LockDevice: Locked GPU 3 to test availability. +LockDevice: Unlocked GPU 3 after testing. +LockDevice: Locked GPU 0 for exclusive use. NDLBuilder Using GPU 0 -reading uci file ./Data/Train.txt +Reading UCI file Data/Train.txt +SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4 + +Post-processing network... + +3 roots: + outputNodes.z = Plus + ce = CrossEntropyWithSoftmax + err = ErrorPrediction +FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation +FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation +FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation -Allocating matrices for forward propagation. +Validating for node outputNodes.z. 24 nodes to process in pass 1. - -Validating for node CE. 26 nodes to process in pass 1. 
- -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -218,27 +250,25 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], 
h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE. 15 nodes to process in pass 2. +Validating for node outputNodes.z. 14 nodes to process in pass 2. -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -247,27 +277,25 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 
1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] +Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] +Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE, final verification. +Validating for node outputNodes.z, final verification. -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -276,292 +304,29 @@ Validating --> features = InputValue -> [784, MBSize 1] Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] +Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0] +Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] +Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] +Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] +Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] +Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] +Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] +Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = 
Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node CE. 26 nodes to process in pass 1. - -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -Validating for node CE. 14 nodes to process in pass 2. 
- -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -Validating for node CE, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 1] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 1] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1] - -Validating for node OutputNodes.z, final verification. 
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 1]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]

 9 out of 24 nodes do not share the minibatch layout with the input data.
-
-Validating for node OutputNodes.z. 24 nodes to process in pass 1.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 1]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-
-Validating for node OutputNodes.z. 13 nodes to process in pass 2.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 1]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-
-Validating for node OutputNodes.z, final verification.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 1]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-
-9 out of 24 nodes do not share the minibatch layout with the input data.
-
-
-
-Validating for node Err. 26 nodes to process in pass 1.
+Validating for node ce. 26 nodes to process in pass 1.
 Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -570,27 +335,27 @@ Validating --> features = InputValue -> [784, MBSize 1]
 Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
 Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
 Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
 Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
 Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node Err. 14 nodes to process in pass 2.
+Validating for node ce. 14 nodes to process in pass 2.

 Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -599,27 +364,27 @@ Validating --> features = InputValue -> [784, MBSize 1]
 Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
 Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
 Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
 Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
 Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node Err, final verification.
+Validating for node ce, final verification.

 Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -628,31 +393,30 @@ Validating --> features = InputValue -> [784, MBSize 1]
 Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
 Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
 Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
 Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
 Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1]

 10 out of 26 nodes do not share the minibatch layout with the input data.
-
-Validating for node Err. 26 nodes to process in pass 1.
+Validating for node err. 26 nodes to process in pass 1.

 Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -661,27 +425,27 @@ Validating --> features = InputValue -> [784, MBSize 1]
 Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
 Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
 Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
 Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
 Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node Err. 14 nodes to process in pass 2.
+Validating for node err. 14 nodes to process in pass 2.

 Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -690,27 +454,27 @@ Validating --> features = InputValue -> [784, MBSize 1]
 Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
 Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
 Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
 Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
 Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node Err, final verification.
+Validating for node err, final verification.
 Validating --> labels = InputValue -> [10, MBSize 1]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -719,35 +483,40 @@ Validating --> features = InputValue -> [784, MBSize 1]
 Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 1]) -> [784, MBSize 1]
 Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 1]) -> [9216, MBSize 1]
 Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 1]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 1]) -> [9216, MBSize 1]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 1]) -> [2304, MBSize 1]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 1]) -> [2048, MBSize 1]
+Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 1], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
+Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
+Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
+Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
 Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 1], conv2_act.convB[32, 1]) -> [2048, MBSize 1]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 1]) -> [2048, MBSize 1]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 1]) -> [512, MBSize 1]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 1]) -> [128, MBSize 1]
+Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
+Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
+Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
+Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
 Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 1], h1.b[128, 1]) -> [128, MBSize 1]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 1]) -> [128, MBSize 1]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 1]) -> [10, MBSize 1]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 1], OutputNodes.b[10, 1]) -> [10, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 1], OutputNodes.z[10, MBSize 1]) -> [1, 1]
+Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
+Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> err = ErrorPrediction(labels[10, MBSize 1], outputNodes.z[10, MBSize 0]) -> [1, 1]

 10 out of 26 nodes do not share the minibatch layout with the input data.

-SetUniformRandomValue (GPU): creating curand object with seed 1
+Post-processing network complete.
+
 SGD using GPU 0.

-GetTrainCriterionNodes ...
-GetEvalCriterionNodes ...
+
+Training criterion node(s):
+	ce = CrossEntropyWithSoftmax
+
+Evaluation criterion node(s):
+	err = ErrorPrediction

-Allocating matrices for gradient computing
+Allocating matrices for forward and/or backward propagation.
 No PreCompute nodes found, skipping PreCompute step
 Set Max Temp Mem Size For Convolution Nodes to 0 samples.

-Starting Epoch 1: learning rate per sample = 0.005000 effective momentum = 0.000000
+Starting Epoch 1: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples
 starting at epoch 0 counting lines to determine record count
 1000 records found
 starting epoch 0 at record count 0, and file position 0
 already there from last epoch

 Starting minibatch loop.
-randomordering: 21 retries for 100 elements (21.0%) to ensure window condition
-randomordering: recached sequence for seed 0: 38, 46, ...
- Epoch[ 1 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.41911163; EvalErr[0]PerSample = 0.92000000; TotalTime = 0.10084s; TotalTimePerSample = 1.00839ms; SamplesPerSecond = 991
-Finished Epoch[ 1 of 12]: [Training Set] TrainLossPerSample = 2.4191115; EvalErrPerSample = 0.91999996; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.114029
-Starting Epoch 2: learning rate per sample = 0.005000 effective momentum = 0.000000
+RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 0: 38, 46, ...
+MBLayout::Init: Resizing m_distanceToStart from 1 x 0 to 10 x 1
+ Epoch[ 1 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.39150986; EvalErr[0]PerSample = 0.94000000; TotalTime = 0.5221s; SamplesPerSecond = 191.5
+Finished Epoch[ 1 of 12]: [Training Set] TrainLossPerSample = 2.3915098; EvalErrPerSample = 0.94; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.542948
+Starting Epoch 2: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples
 starting epoch 1 at record count 100, and file position 100
 already there from last epoch

 Starting minibatch loop.
-randomordering: 21 retries for 100 elements (21.0%) to ensure window condition
-randomordering: recached sequence for seed 1: 38, 46, ...
- Epoch[ 2 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.38765198; EvalErr[0]PerSample = 0.89000000; TotalTime = 0.03237s; TotalTimePerSample = 0.32373ms; SamplesPerSecond = 3088
-Finished Epoch[ 2 of 12]: [Training Set] TrainLossPerSample = 2.3876519; EvalErrPerSample = 0.88999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.032542
-Starting Epoch 3: learning rate per sample = 0.005000 effective momentum = 0.000000
+RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 1: 38, 46, ...
+ Epoch[ 2 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.29544357; EvalErr[0]PerSample = 0.87000000; TotalTime = 0.0198s; SamplesPerSecond = 5052.8
+Finished Epoch[ 2 of 12]: [Training Set] TrainLossPerSample = 2.2954435; EvalErrPerSample = 0.87; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.020015
+Starting Epoch 3: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples
 starting epoch 2 at record count 200, and file position 200
 already there from last epoch

 Starting minibatch loop.
-randomordering: 30 retries for 100 elements (30.0%) to ensure window condition
-randomordering: recached sequence for seed 2: 34, 6, ...
- Epoch[ 3 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.30177277; EvalErr[0]PerSample = 0.85000000; TotalTime = 0.03249s; TotalTimePerSample = 0.32492ms; SamplesPerSecond = 3077
-Finished Epoch[ 3 of 12]: [Training Set] TrainLossPerSample = 2.3017728; EvalErrPerSample = 0.84999996; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.032668
-Starting Epoch 4: learning rate per sample = 0.005000 effective momentum = 0.000000
+RandomOrdering: 30 retries for 100 elements (30.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 2: 34, 6, ...
+ Epoch[ 3 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 2.11703644; EvalErr[0]PerSample = 0.69000000; TotalTime = 0.0193s; SamplesPerSecond = 5173.0
+Finished Epoch[ 3 of 12]: [Training Set] TrainLossPerSample = 2.1170363; EvalErrPerSample = 0.69; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.019492
+Starting Epoch 4: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples
 starting epoch 3 at record count 300, and file position 300
 already there from last epoch

 Starting minibatch loop.
-randomordering: 14 retries for 100 elements (14.0%) to ensure window condition
-randomordering: recached sequence for seed 3: 35, 34, ...
- Epoch[ 4 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.33002518; EvalErr[0]PerSample = 0.89000000; TotalTime = 0.03224s; TotalTimePerSample = 0.32243ms; SamplesPerSecond = 3101
-Finished Epoch[ 4 of 12]: [Training Set] TrainLossPerSample = 2.3300252; EvalErrPerSample = 0.88999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.032407
-Starting Epoch 5: learning rate per sample = 0.005000 effective momentum = 0.000000
+RandomOrdering: 14 retries for 100 elements (14.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 3: 35, 34, ...
+ Epoch[ 4 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.99407211; EvalErr[0]PerSample = 0.65000000; TotalTime = 0.0197s; SamplesPerSecond = 5072.5
+Finished Epoch[ 4 of 12]: [Training Set] TrainLossPerSample = 1.9940721; EvalErrPerSample = 0.64999998; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.019875
+Starting Epoch 5: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples
 starting epoch 4 at record count 400, and file position 400
 already there from last epoch

 Starting minibatch loop.
-randomordering: 13 retries for 100 elements (13.0%) to ensure window condition
-randomordering: recached sequence for seed 4: 30, 23, ...
- Epoch[ 5 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.23725708; EvalErr[0]PerSample = 0.88000000; TotalTime = 0.03227s; TotalTimePerSample = 0.32265ms; SamplesPerSecond = 3099
-Finished Epoch[ 5 of 12]: [Training Set] TrainLossPerSample = 2.237257; EvalErrPerSample = 0.88; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.03243
-Starting Epoch 6: learning rate per sample = 0.005000 effective momentum = 0.000000
+RandomOrdering: 13 retries for 100 elements (13.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 4: 30, 23, ...
+ Epoch[ 5 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.72756805; EvalErr[0]PerSample = 0.45000000; TotalTime = 0.0228s; SamplesPerSecond = 4381.2
+Finished Epoch[ 5 of 12]: [Training Set] TrainLossPerSample = 1.727568; EvalErrPerSample = 0.44999999; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.023131
+Starting Epoch 6: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples
 starting epoch 5 at record count 500, and file position 500
 already there from last epoch

 Starting minibatch loop.
-randomordering: 25 retries for 100 elements (25.0%) to ensure window condition
-randomordering: recached sequence for seed 5: 33, 43, ...
- Epoch[ 6 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.24089386; EvalErr[0]PerSample = 0.90000000; TotalTime = 0.03225s; TotalTimePerSample = 0.32247ms; SamplesPerSecond = 3101
-Finished Epoch[ 6 of 12]: [Training Set] TrainLossPerSample = 2.2408938; EvalErrPerSample = 0.89999998; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.032414
-Starting Epoch 7: learning rate per sample = 0.005000 effective momentum = 0.000000
+RandomOrdering: 25 retries for 100 elements (25.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 5: 33, 43, ...
+ Epoch[ 6 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.51963470; EvalErr[0]PerSample = 0.21000000; TotalTime = 0.0198s; SamplesPerSecond = 5047.2
+Finished Epoch[ 6 of 12]: [Training Set] TrainLossPerSample = 1.5196347; EvalErrPerSample = 0.20999999; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.020056
+Starting Epoch 7: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples
 starting epoch 6 at record count 600, and file position 600
 already there from last epoch

 Starting minibatch loop.
-randomordering: 14 retries for 100 elements (14.0%) to ensure window condition
-randomordering: recached sequence for seed 6: 12, 17, ...
- Epoch[ 7 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.15189026; EvalErr[0]PerSample = 0.80000000; TotalTime = 0.03228s; TotalTimePerSample = 0.32278ms; SamplesPerSecond = 3098
-Finished Epoch[ 7 of 12]: [Training Set] TrainLossPerSample = 2.1518903; EvalErrPerSample = 0.79999995; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.032436
-Starting Epoch 8: learning rate per sample = 0.005000 effective momentum = 0.000000
+RandomOrdering: 14 retries for 100 elements (14.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 6: 12, 17, ...
+ Epoch[ 7 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 1.29057785; EvalErr[0]PerSample = 0.20000000; TotalTime = 0.0200s; SamplesPerSecond = 4995.5
+Finished Epoch[ 7 of 12]: [Training Set] TrainLossPerSample = 1.2905778; EvalErrPerSample = 0.19999999; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.020179
+Starting Epoch 8: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples
 starting epoch 7 at record count 700, and file position 700
 already there from last epoch

 Starting minibatch loop.
-randomordering: 14 retries for 100 elements (14.0%) to ensure window condition
-randomordering: recached sequence for seed 7: 40, 7, ...
- Epoch[ 8 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 2.02036377; EvalErr[0]PerSample = 0.68000000; TotalTime = 0.03236s; TotalTimePerSample = 0.32362ms; SamplesPerSecond = 3090
-Finished Epoch[ 8 of 12]: [Training Set] TrainLossPerSample = 2.0203638; EvalErrPerSample = 0.68000001; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.032545
-Starting Epoch 9: learning rate per sample = 0.005000 effective momentum = 0.000000
+RandomOrdering: 14 retries for 100 elements (14.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 7: 40, 7, ...
+ Epoch[ 8 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.97353966; EvalErr[0]PerSample = 0.05000000; TotalTime = 0.0192s; SamplesPerSecond = 5198.3
+Finished Epoch[ 8 of 12]: [Training Set] TrainLossPerSample = 0.97353965; EvalErrPerSample = 0.049999997; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.019408
+Starting Epoch 9: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples
 starting epoch 8 at record count 800, and file position 800
 already there from last epoch

 Starting minibatch loop.
-randomordering: 17 retries for 100 elements (17.0%) to ensure window condition
-randomordering: recached sequence for seed 8: 8, 48, ...
- Epoch[ 9 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.74879242; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.03041s; TotalTimePerSample = 0.30406ms; SamplesPerSecond = 3288
-Finished Epoch[ 9 of 12]: [Training Set] TrainLossPerSample = 1.7487924; EvalErrPerSample = 0.44; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.030574
-Starting Epoch 10: learning rate per sample = 0.005000 effective momentum = 0.000000
+RandomOrdering: 17 retries for 100 elements (17.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 8: 8, 48, ...
+ Epoch[ 9 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.73900352; EvalErr[0]PerSample = 0.03000000; TotalTime = 0.0192s; SamplesPerSecond = 5201.0
+Finished Epoch[ 9 of 12]: [Training Set] TrainLossPerSample = 0.73900348; EvalErrPerSample = 0.029999999; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.019389
+Starting Epoch 10: learning rate per sample = 0.005000 effective momentum = 0.000000 momentum as time constant = 0.0 samples
 starting epoch 9 at record count 900, and file position 900
 already there from last epoch

 Starting minibatch loop.
-randomordering: 21 retries for 100 elements (21.0%) to ensure window condition
-randomordering: recached sequence for seed 9: 14, 26, ...
- Epoch[10 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.56006454; EvalErr[0]PerSample = 0.18000000; TotalTime = 0.03032s; TotalTimePerSample = 0.30320ms; SamplesPerSecond = 3298
-Finished Epoch[10 of 12]: [Training Set] TrainLossPerSample = 1.5600646; EvalErrPerSample = 0.17999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.030483
-Starting Epoch 11: learning rate per sample = 0.005000 effective momentum = 0.700000
+RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 9: 14, 26, ...
+ Epoch[10 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.57409992; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0193s; SamplesPerSecond = 5188.9
+Finished Epoch[10 of 12]: [Training Set] TrainLossPerSample = 0.5740999; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.019445
+Starting Epoch 11: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples
 starting epoch 10 at record count 1000, and file position 0
 already there from last epoch

 Starting minibatch loop.
-randomordering: 31 retries for 100 elements (31.0%) to ensure window condition
-randomordering: recached sequence for seed 10: 22, 4, ...
- Epoch[11 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.32553162; EvalErr[0]PerSample = 0.14000000; TotalTime = 0.03050s; TotalTimePerSample = 0.30496ms; SamplesPerSecond = 3279
-Finished Epoch[11 of 12]: [Training Set] TrainLossPerSample = 1.3255316; EvalErrPerSample = 0.14; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.030661
-Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000
+RandomOrdering: 31 retries for 100 elements (31.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 10: 22, 4, ...
+ Epoch[11 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.45136490; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0196s; SamplesPerSecond = 5107.5
+Finished Epoch[11 of 12]: [Training Set] TrainLossPerSample = 0.45136487; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.019784
+Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples
 starting epoch 11 at record count 1100, and file position 100
 already there from last epoch

 Starting minibatch loop.
-randomordering: 17 retries for 100 elements (17.0%) to ensure window condition
-randomordering: recached sequence for seed 11: 2, 40, ...
- Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.01003433; EvalErr[0]PerSample = 0.03000000; TotalTime = 0.03054s; TotalTimePerSample = 0.30545ms; SamplesPerSecond = 3273
-Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 1.0100343; EvalErrPerSample = 0.029999999; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.030702
-CNTKCommandTrainEnd: Train
+RandomOrdering: 17 retries for 100 elements (17.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 11: 2, 40, ...
+ Epoch[12 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.34551861; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.0194s; SamplesPerSecond = 5165.0
+Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.34551859; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.019532
+CNTKCommandTrainEnd: train
+
+Post-processing network...
+
+3 roots:
+	ce = CrossEntropyWithSoftmax
+	outputNodes.z = Plus
+	err = ErrorPrediction
 FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation
 FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation
 FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation
-Allocating matrices for forward propagation.
-
-
-Validating for node CE. 26 nodes to process in pass 1.
+Validating for node ce. 26 nodes to process in pass 1.

 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -888,15 +665,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node CE. 15 nodes to process in pass 2.
+Validating for node ce. 15 nodes to process in pass 2.
 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -917,15 +694,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

-Validating for node CE, final verification.
+Validating for node ce, final verification.

 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -946,19 +723,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]

 10 out of 26 nodes do not share the minibatch layout with the input data.

+Validating for node outputNodes.z. 24 nodes to process in pass 1.

-Validating for node CE. 26 nodes to process in pass 1.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -979,15 +754,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]

-Validating for node CE. 14 nodes to process in pass 2.
+Validating for node outputNodes.z. 13 nodes to process in pass 2.

-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -1008,15 +781,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]

-Validating for node CE, final verification.
+Validating for node outputNodes.z, final verification.
-Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1037,189 +808,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. - -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] 9 out of 24 nodes do not share the minibatch layout with the input data. - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -9 out of 24 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. +Validating for node err. 26 nodes to process in pass 1. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1240,15 +839,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err. 14 nodes to process in pass 2. +Validating for node err. 14 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1269,15 +868,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node Err, final verification. +Validating for node err, final verification. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1298,326 +897,272 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node Err. 26 nodes to process in pass 1. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err. 
14 nodes to process in pass 2. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -Validating for node Err, final verification. 
- -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Post-processing network complete. evalNodeNames are not specified, using all the default evalnodes and training criterion nodes. + + +Allocating matrices for forward and/or backward propagation. starting epoch 0 at record count 0, and file position 0 already there from last epoch -randomordering: 21 retries for 100 elements (21.0%) to ensure window condition -randomordering: recached sequence for seed 0: 38, 46, ... -Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.84035759 Perplexity = 2.3171954 +RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition +RandomOrdering: recached sequence for seed 0: 38, 46, ... 
+MBLayout::Init: Resizing m_distanceToStart from 1 x 0 to 100 x 1 +Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.30270519 Perplexity = 1.3535154 COMPLETED === Deleting last epoch data ==== Re-running from checkpoint -running on localhost at 2015/11/23 11:52:51 +------------------------------------------------------------------- +Build info: + + Built time: Dec 18 2015 23:17:10 + Last modified date: Fri Dec 18 23:16:43 2015 + Build type: release + Math lib: acml + CUDA_PATH: /usr/local/cuda-7.0 + CUB_PATH: /usr/local/cub-1.4.1 + Build Branch: master + Build SHA1: f675c24ad6e803523212d772c27ae2c2c98b6ce9 +------------------------------------------------------------------- +running on localhost at 2015/12/18 23:27:59 command line: -/home/alexey/Projects/cntk/bin/cntk configFile=./QuickE2E/cntk.config DataDir=./Data RunDir=. ConfigDir=./QuickE2E DeviceId=0 +../../../bin/cntk configFile=QuickE2E/cntk.config ConfigDir=QuickE2E RunDir=_out DataDir=Data DeviceId=Auto stderr=gpu.txt >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=$DeviceId$ -ndlMacros=$ConfigDir$/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=$RunDir$/models/cntk.dnn - deviceId=$DeviceId$ - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = $DeviceId$ +ndlMacros = "$ConfigDir$/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "$RunDir$/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=$RunDir$/models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=$ConfigDir$/Convolution.ndl +test = [ + action = "test" + modelPath = "$RunDir$/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=$DataDir$/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "$DataDir$/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=$DataDir$/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "$DataDir$/labelsmap.txt" ] ] ] -DataDir=./Data -RunDir=. 
-ConfigDir=./QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -precision=float -command=Train:Test -deviceId=0 -ndlMacros=./QuickE2E/Macros.ndl -parallelTrain=false -Train=[ - action=train - modelPath=./models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +precision = "float" +command = train:test +deviceId = Auto +ndlMacros = "QuickE2E/Macros.ndl" +parallelTrain = false +numCPUThreads = 8 +train = [ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -Test=[ - action=test - modelPath=./models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl +test = [ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -DataDir=./Data -RunDir=. -ConfigDir=./QuickE2E -DeviceId=0 +ConfigDir=QuickE2E +RunDir=_out +DataDir=Data +DeviceId=Auto +stderr=gpu.txt <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -configparameters: cntk.config:command=Train:Test -configparameters: cntk.config:ConfigDir=./QuickE2E -configparameters: cntk.config:DataDir=./Data -configparameters: cntk.config:deviceId=0 -configparameters: cntk.config:ndlMacros=./QuickE2E/Macros.ndl +configparameters: cntk.config:command=train:test +configparameters: cntk.config:ConfigDir=QuickE2E +configparameters: cntk.config:DataDir=Data +configparameters: cntk.config:deviceId=Auto +configparameters: cntk.config:ndlMacros=QuickE2E/Macros.ndl +configparameters: cntk.config:numCPUThreads=8 configparameters: cntk.config:parallelTrain=false configparameters: cntk.config:precision=float -configparameters: cntk.config:RunDir=. 
-configparameters: cntk.config:Test=[ - action=test - modelPath=./models/cntk.dnn - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl +configparameters: cntk.config:RunDir=_out +configparameters: cntk.config:stderr=gpu.txt +configparameters: cntk.config:test=[ + action = "test" + modelPath = "_out/models/cntk.dnn" + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Test.txt - features=[ - dim=784 - start=1 + reader = [ + readerType = "UCIFastReader" + file = "Data/Test.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] -configparameters: cntk.config:Train=[ - action=train - modelPath=./models/cntk.dnn - deviceId=0 - traceLevel=1 - NDLNetworkBuilder=[ - networkDescription=./QuickE2E/Convolution.ndl - ] - SGD=[ - epochSize=100 - minibatchSize=10 - learningRatesPerMB=0.05 - momentumPerMB=0*10:0.7 - maxEpochs=12 +configparameters: cntk.config:train=[ + action = "train" + modelPath = "_out/models/cntk.dnn" + traceLevel = 1 + NDLNetworkBuilder = [ + networkDescription = "QuickE2E/Convolution.ndl" ] - reader=[ - readerType=UCIFastReader - file=./Data/Train.txt - features=[ - dim=784 - start=1 + SGD = [ + epochSize = 100 + minibatchSize = 10 + learningRatesPerMB = 0.05 + momentumPerMB = 0*10:0.7 + maxEpochs = 12 + ] + reader = [ + readerType = "UCIFastReader" + file = "Data/Train.txt" + features = [ + dim = 784 + start = 1 ] - labels=[ - dim=1 - start=0 - labelDim=10 - labelMappingFile=./Data/labelsmap.txt + labels = [ + dim = 1 + start = 0 + labelDim = 10 + labelMappingFile = "Data/labelsmap.txt" ] ] ] <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< -command: Train Test +command: train test precision = float -CNTKModelPath: ./models/cntk.dnn -CNTKCommandTrainInfo: Train : 12 +Using 8 CPU threads +CNTKModelPath: _out/models/cntk.dnn +CNTKCommandTrainInfo: train : 12 CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 12 -CNTKCommandTrainBegin: Train +CNTKCommandTrainBegin: train +LockDevice: Locked GPU 0 to test availability. +LockDevice: Unlocked GPU 0 after testing. +LockDevice: Locked GPU 1 to test availability. +LockDevice: Unlocked GPU 1 after testing. +LockDevice: Locked GPU 2 to test availability. +LockDevice: Unlocked GPU 2 after testing. +LockDevice: Locked GPU 3 to test availability. +LockDevice: Unlocked GPU 3 after testing. +LockDevice: Locked GPU 0 for exclusive use. NDLBuilder Using GPU 0 -reading uci file ./Data/Train.txt -Starting from checkpoint. Load Network From File ./models/cntk.dnn.11. +Reading UCI file Data/Train.txt +Starting from checkpoint. Load Network From File _out/models/cntk.dnn.11. + +Post-processing network... + +3 roots: + err = ErrorPrediction + outputNodes.z = Plus + ce = CrossEntropyWithSoftmax +FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation +FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation +FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation -Allocating matrices for forward propagation. - - -Validating for node CE. 26 nodes to process in pass 1. +Validating for node err. 26 nodes to process in pass 1. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1638,15 +1183,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE. 15 nodes to process in pass 2. +Validating for node err. 15 nodes to process in pass 2. Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1667,15 +1212,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] -Validating for node CE, final verification. +Validating for node err, final verification. 
Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1696,19 +1241,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1] 10 out of 26 nodes do not share the minibatch layout with the input data. +Validating for node outputNodes.z. 24 nodes to process in pass 1. -Validating for node CE. 26 nodes to process in pass 1. - -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1729,15 +1272,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE. 14 nodes to process in pass 2. +Validating for node outputNodes.z. 13 nodes to process in pass 2. 
-Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1758,15 +1299,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] -Validating for node CE, final verification. +Validating for node outputNodes.z, final verification. -Validating --> labels = InputValue -> [10, MBSize 0] -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] +Validating --> outputNodes.W = LearnableParameter -> [10, 128] Validating --> h1.W = LearnableParameter -> [128, 512] Validating --> conv2_act.convW = LearnableParameter -> [32, 400] Validating --> conv1_act.convW = LearnableParameter -> [16, 25] @@ -1787,189 +1326,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0 Validating --> h1.b = LearnableParameter -> [128, 1] Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] -Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1] - -10 out of 26 nodes do not share the minibatch layout with the input data. - - - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z. 13 nodes to process in pass 2. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] - -Validating for node OutputNodes.z, final verification. 
- -Validating --> OutputNodes.W = LearnableParameter -> [10, 128] -Validating --> h1.W = LearnableParameter -> [128, 512] -Validating --> conv2_act.convW = LearnableParameter -> [32, 400] -Validating --> conv1_act.convW = LearnableParameter -> [16, 25] -Validating --> featScale = LearnableParameter -> [1, 1] -Validating --> features = InputValue -> [784, MBSize 0] -Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0] -Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0] -Validating --> conv1_act.convB = LearnableParameter -> [16, 1] -Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0] -Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0] -Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0] -Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0] -Validating --> conv2_act.convB = LearnableParameter -> [32, 1] -Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0] -Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0] -Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0] -Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0] -Validating --> h1.b = LearnableParameter -> [128, 1] -Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0] -Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0] -Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] -Validating --> OutputNodes.b = LearnableParameter -> [10, 1] -Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0] +Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0] +Validating --> outputNodes.b = LearnableParameter -> [10, 1] +Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0] 9 out of 24 nodes do not share the minibatch layout with the input data. - -Validating for node OutputNodes.z. 24 nodes to process in pass 1. 
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-Validating for node OutputNodes.z. 13 nodes to process in pass 2.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-Validating for node OutputNodes.z, final verification.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-9 out of 24 nodes do not share the minibatch layout with the input data.
-
-
-
-Validating for node Err. 26 nodes to process in pass 1.
+Validating for node ce. 26 nodes to process in pass 1.
 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -1990,15 +1357,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]
 
-Validating for node Err. 14 nodes to process in pass 2.
+Validating for node ce. 14 nodes to process in pass 2.
 
 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2019,15 +1386,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]
 
-Validating for node Err, final verification.
+Validating for node ce, final verification.
 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2048,114 +1415,29 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]
 
 10 out of 26 nodes do not share the minibatch layout with the input data.
 
-
-
-Validating for node Err. 26 nodes to process in pass 1.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-Validating for node Err. 14 nodes to process in pass 2.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-Validating for node Err, final verification.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-10 out of 26 nodes do not share the minibatch layout with the input data.
+Post-processing network complete.
 
 SGD using GPU 0.
 
-GetTrainCriterionNodes ...
-GetEvalCriterionNodes ...
+
+Training criterion node(s):
+
+	ce = CrossEntropyWithSoftmax
+
+Evaluation criterion node(s):
+
+	err = ErrorPrediction
 
-Allocating matrices for gradient computing
+Allocating matrices for forward and/or backward propagation.
 No PreCompute nodes found, skipping PreCompute step
 Warning: checkpoint file is missing. learning parameters will be initialized from 0
 Set Max Temp Mem Size For Convolution Nodes to 0 samples.
-Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000
+Starting Epoch 12: learning rate per sample = 0.005000 effective momentum = 0.700000 momentum as time constant = 28.0 samples
 starting at epoch 11 counting lines to determine record count
 1000 records found
@@ -2163,20 +1445,28 @@ starting epoch 11 at record count 1100, and file position 100
 reading from record 0 to 100 to be positioned properly for epoch
 Starting minibatch loop.
-randomordering: 17 retries for 100 elements (17.0%) to ensure window condition
-randomordering: recached sequence for seed 11: 2, 40, ...
- Epoch[12 of 12]-Minibatch[ 1- 10 of 10]: SamplesSeen = 100; TrainLossPerSample = 1.01463303; EvalErr[0]PerSample = 0.02000000; TotalTime = 0.10305s; TotalTimePerSample = 1.03054ms; SamplesPerSecond = 970
-Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 1.0146331; EvalErrPerSample = 0.02; AvgLearningRatePerSample = 0.004999999888; EpochTime=0.117787
-CNTKCommandTrainEnd: Train
+RandomOrdering: 17 retries for 100 elements (17.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 11: 2, 40, ...
+MBLayout::Init: Resizing m_distanceToStart from 1 x 0 to 10 x 1
+ Epoch[12 of 12]-Minibatch[ 1- 10, 100.00%]: SamplesSeen = 100; TrainLossPerSample = 0.34676910; EvalErr[0]PerSample = 0.00000000; TotalTime = 0.1298s; SamplesPerSecond = 770.6
+Finished Epoch[12 of 12]: [Training Set] TrainLossPerSample = 0.34676909; EvalErrPerSample = 0; AvgLearningRatePerSample = 0.0049999999; EpochTime=0.143116
+CNTKCommandTrainEnd: train
+
+Post-processing network...
+
+3 roots:
+	err = ErrorPrediction
+	outputNodes.z = Plus
+	ce = CrossEntropyWithSoftmax
+FormNestedNetwork: WARNING: Was called twice for err ErrorPrediction operation
+FormNestedNetwork: WARNING: Was called twice for outputNodes.z Plus operation
+FormNestedNetwork: WARNING: Was called twice for ce CrossEntropyWithSoftmax operation
 
-Allocating matrices for forward propagation.
-
-
-Validating for node CE. 26 nodes to process in pass 1.
+Validating for node err. 26 nodes to process in pass 1.
 
 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2197,15 +1487,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]
 
-Validating for node CE. 15 nodes to process in pass 2.
+Validating for node err. 15 nodes to process in pass 2.
 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2226,15 +1516,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]
 
-Validating for node CE, final verification.
+Validating for node err, final verification.
 
 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2255,19 +1545,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> err = ErrorPrediction(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]
 
 10 out of 26 nodes do not share the minibatch layout with the input data.
 
+Validating for node outputNodes.z. 24 nodes to process in pass 1.
 
-Validating for node CE. 26 nodes to process in pass 1.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2288,15 +1576,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
 
-Validating for node CE. 14 nodes to process in pass 2.
+Validating for node outputNodes.z. 13 nodes to process in pass 2.
 
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2317,15 +1603,13 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
 
-Validating for node CE, final verification.
+Validating for node outputNodes.z, final verification.
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2346,189 +1630,17 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> CE = CrossEntropyWithSoftmax(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-10 out of 26 nodes do not share the minibatch layout with the input data.
-
-
-
-Validating for node OutputNodes.z. 24 nodes to process in pass 1.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-Validating for node OutputNodes.z. 13 nodes to process in pass 2.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-Validating for node OutputNodes.z, final verification.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
 
 9 out of 24 nodes do not share the minibatch layout with the input data.
 
-
-Validating for node OutputNodes.z. 24 nodes to process in pass 1.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-Validating for node OutputNodes.z. 13 nodes to process in pass 2.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-Validating for node OutputNodes.z, final verification.
-
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-
-9 out of 24 nodes do not share the minibatch layout with the input data.
-
-
-
-Validating for node Err. 26 nodes to process in pass 1.
+Validating for node ce. 26 nodes to process in pass 1.
 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2549,15 +1661,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]
 
-Validating for node Err. 14 nodes to process in pass 2.
+Validating for node ce. 14 nodes to process in pass 2.
 
 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2578,15 +1690,15 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]
 
-Validating for node Err, final verification.
+Validating for node ce, final verification.
 Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
+Validating --> outputNodes.W = LearnableParameter -> [10, 128]
 Validating --> h1.W = LearnableParameter -> [128, 512]
 Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
 Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
@@ -2607,108 +1719,22 @@ Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0
 Validating --> h1.b = LearnableParameter -> [128, 1]
 Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
 Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
+Validating --> outputNodes.t = Times(outputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
+Validating --> outputNodes.b = LearnableParameter -> [10, 1]
+Validating --> outputNodes.z = Plus(outputNodes.t[10, MBSize 0], outputNodes.b[10, 1]) -> [10, MBSize 0]
+Validating --> ce = CrossEntropyWithSoftmax(labels[10, MBSize 0], outputNodes.z[10, MBSize 0]) -> [1, 1]
 
 10 out of 26 nodes do not share the minibatch layout with the input data.
 
-
-
-Validating for node Err. 26 nodes to process in pass 1.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-Validating for node Err. 14 nodes to process in pass 2.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
-
-Validating for node Err, final verification.
-
-Validating --> labels = InputValue -> [10, MBSize 0]
-Validating --> OutputNodes.W = LearnableParameter -> [10, 128]
-Validating --> h1.W = LearnableParameter -> [128, 512]
-Validating --> conv2_act.convW = LearnableParameter -> [32, 400]
-Validating --> conv1_act.convW = LearnableParameter -> [16, 25]
-Validating --> featScale = LearnableParameter -> [1, 1]
-Validating --> features = InputValue -> [784, MBSize 0]
-Validating --> featScaled = Scale(featScale[1, 1], features[784 {W=28, H=28, C=1}, MBSize 0]) -> [784, MBSize 0]
-Validating --> conv1_act.conv = Convolution(conv1_act.convW[16, 25], featScaled[784 {W=28, H=28, C=1}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> conv1_act.convB = LearnableParameter -> [16, 1]
-Validating --> conv1_act.convPlusB = Plus(conv1_act.conv[9216 {W=24, H=24, C=16}, MBSize 0], conv1_act.convB[16, 1]) -> [9216, MBSize 0]
-Validating --> conv1_act.act = RectifiedLinear(conv1_act.convPlusB[9216 {W=24, H=24, C=16}, MBSize 0]) -> [9216, MBSize 0]
-Validating --> pool1 = MaxPooling(conv1_act.act[9216 {W=24, H=24, C=16}, MBSize 0]) -> [2304, MBSize 0]
-Validating --> conv2_act.conv = Convolution(conv2_act.convW[32, 400], pool1[2304 {W=12, H=12, C=16}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> conv2_act.convB = LearnableParameter -> [32, 1]
-Validating --> conv2_act.convPlusB = Plus(conv2_act.conv[2048 {W=8, H=8, C=32}, MBSize 0], conv2_act.convB[32, 1]) -> [2048, MBSize 0]
-Validating --> conv2_act.act = RectifiedLinear(conv2_act.convPlusB[2048 {W=8, H=8, C=32}, MBSize 0]) -> [2048, MBSize 0]
-Validating --> pool2 = AveragePooling(conv2_act.act[2048 {W=8, H=8, C=32}, MBSize 0]) -> [512, MBSize 0]
-Validating --> h1.t = Times(h1.W[128, 512], pool2[512 {W=4, H=4, C=32}, MBSize 0]) -> [128, MBSize 0]
-Validating --> h1.b = LearnableParameter -> [128, 1]
-Validating --> h1.z = Plus(h1.t[128, MBSize 0], h1.b[128, 1]) -> [128, MBSize 0]
-Validating --> h1.y = Sigmoid(h1.z[128, MBSize 0]) -> [128, MBSize 0]
-Validating --> OutputNodes.t = Times(OutputNodes.W[10, 128], h1.y[128, MBSize 0]) -> [10, MBSize 0]
-Validating --> OutputNodes.b = LearnableParameter -> [10, 1]
-Validating --> OutputNodes.z = Plus(OutputNodes.t[10, MBSize 0], OutputNodes.b[10, 1]) -> [10, MBSize 0]
-Validating --> Err = ErrorPrediction(labels[10, MBSize 0], OutputNodes.z[10, MBSize 0]) -> [1, 1]
 
 10 out of 26 nodes do not share the minibatch layout with the input data.
 
+Post-processing network complete.
+
 evalNodeNames are not specified, using all the default evalnodes and training criterion nodes.
+
+Allocating matrices for forward and/or backward propagation.
 starting epoch 0 at record count 0, and file position 0
 already there from last epoch
-randomordering: 21 retries for 100 elements (21.0%) to ensure window condition
-randomordering: recached sequence for seed 0: 38, 46, ...
-Final Results: Minibatch[1-1]: Samples Seen = 100 Err: ErrorPrediction/Sample = 0 CE: CrossEntropyWithSoftmax/Sample = 0.88667282 Perplexity = 2.427041
+RandomOrdering: 21 retries for 100 elements (21.0%) to ensure window condition
+RandomOrdering: recached sequence for seed 0: 38, 46, ...
+MBLayout::Init: Resizing m_distanceToStart from 1 x 0 to 100 x 1
+Final Results: Minibatch[1-1]: Samples Seen = 100 err: ErrorPrediction/Sample = 0 ce: CrossEntropyWithSoftmax/Sample = 0.31781933 Perplexity = 1.374128
 
 COMPLETED
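
Editor's note (a verification sketch appended for review, not part of the patch): two numbers in the updated baseline can be re-derived by hand. The Perplexity in "Final Results" appears to be exp() of the per-sample cross entropy, and the new "momentum as time constant" figure is consistent with -mbSize / ln(momentum) for the minibatch size of 10 used here (100 samples over 10 minibatches). Both formulas are assumptions of this note rather than taken from the patch; the constants are copied from the log above.

    // sanity-check sketch (C++); both formulas are assumed, not quoted from CNTK
    #include <cmath>
    #include <cstdio>

    int main()
    {
        const double cePerSample = 0.31781933; // ce: CrossEntropyWithSoftmax/Sample
        const double momentum    = 0.7;        // effective momentum per minibatch
        const double mbSize      = 10.0;       // 100 samples over 10 minibatches

        // Perplexity = exp(cross entropy per sample): prints 1.374128
        std::printf("perplexity    = %.6f\n", std::exp(cePerSample));
        // Momentum as time constant = -mbSize / ln(momentum): prints 28.0
        std::printf("time constant = %.1f samples\n", -mbSize / std::log(momentum));
        return 0;
    }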