merged from master. Undid the ClassificationError baseline updates due to merge conflicts

2016-08-22 14:36:28 -07:00 · 2016-08-22 14:36:28 -07:00 · 5b969bac70
--- a/CNTK.Cpp.props
+++ b/CNTK.Cpp.props
@ -34,48 +34,29 @@
    <UseZip Condition="Exists('$(ZLIB_PATH)')">true</UseZip>
  </PropertyGroup>

-  <Choose>
-    <When Condition="Exists('$(ACML_PATH)')">
-      <PropertyGroup>
-        <MathLibrary>ACML</MathLibrary>
-        <MathLibraryName>ACML</MathLibraryName>
-        <MathIncludePath>$(ACML_PATH)\include</MathIncludePath>
-        <MathLibraryPath>$(ACML_PATH)\lib</MathLibraryPath>
-        <MathLinkLibrary>libacml_mp_dll.lib</MathLinkLibrary>
-        <MathDelayLoad>libacml_mp_dll.dll</MathDelayLoad>
-        <MathPostBuildCopyPattern>$(ACML_PATH)\lib\*.dll</MathPostBuildCopyPattern>
-        <UnitTestDlls>$(OutDir)libacml_mp_dll.dll;$(OutDir)libifcoremd.dll;$(OutDir)libifportmd.dll;$(OutDir)libiomp*.dll;$(OutDir)libmmd.dll;$(OutDir)svml_dispmd.dll;</UnitTestDlls>
-        <MathDefine>USE_ACML</MathDefine>
-      </PropertyGroup>
-    </When>
-
-    <!-- See https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#optional-mkl on how to configure to build CNTK with MKL -->
-    <When Condition="'$(CNTK_MKL)' == '1'">
-      <PropertyGroup>
-        <MathLibrary>MKL</MathLibrary>
-        <CNTKCustomMKLVersion>1</CNTKCustomMKLVersion>
-        <CNTKCustomMKLPath>$(CNTK_MKL_PATH)\$(CNTKCustomMKLVersion)</CNTKCustomMKLPath>
-        <MathIncludePath>$(CNTKCustomMKLPath)\include</MathIncludePath>
-        <MathDefine>USE_MKL</MathDefine>
-      </PropertyGroup>
-      <PropertyGroup Condition="'$(CNTK_MKL_SEQUENTIAL)' != '1'">
-        <MathLibraryName>CNTK custom MKL Parallel (Version: $(CNTKCustomMKLVersion))</MathLibraryName>
-        <MathLibraryPath>$(CNTKCustomMKLPath)\x64\parallel</MathLibraryPath>
-        <MathLinkLibrary>mkl_cntk_p.lib</MathLinkLibrary>
-        <MathDelayLoad>mkl_cntk_p.dll</MathDelayLoad>
-        <MathPostBuildCopyPattern>$(MathLibraryPath)\*.dll</MathPostBuildCopyPattern>
-        <UnitTestDlls>$(OutDir)mkl_cntk_p.dll;$(OutDir)libiomp5md.dll;</UnitTestDlls>
-      </PropertyGroup>
-      <PropertyGroup Condition="'$(CNTK_MKL_SEQUENTIAL)' == '1'">
-        <MathLibraryName>CNTK custom MKL Sequential (Version: $(CNTKCustomMKLVersion))</MathLibraryName>
-        <MathLibraryPath>$(CNTKCustomMKLPath)\x64\sequential</MathLibraryPath>
-        <MathLinkLibrary>mkl_cntk_s.lib</MathLinkLibrary>
-        <MathDelayLoad>mkl_cntk_s.dll</MathDelayLoad>
-        <MathPostBuildCopyPattern>$(MathLibraryPath)\*.dll</MathPostBuildCopyPattern>
-        <UnitTestDlls>$(OutDir)mkl_cntk_s.dll;</UnitTestDlls>
-      </PropertyGroup>
-    </When>
-   </Choose>
+  <PropertyGroup> <!-- MKL settings are now unconditional: they no longer depend on ACML_PATH or CNTK_MKL -->
+    <MathLibrary>MKL</MathLibrary>
+    <CNTKCustomMKLVersion>1</CNTKCustomMKLVersion>
+    <CNTKCustomMKLPath>$(CNTK_MKL_PATH)\$(CNTKCustomMKLVersion)</CNTKCustomMKLPath>
+    <MathIncludePath>$(CNTKCustomMKLPath)\include</MathIncludePath>
+    <MathDefine>USE_MKL</MathDefine>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(CNTK_MKL_SEQUENTIAL)' != '1'">
+    <MathLibraryName>CNTK custom MKL Parallel (Version: $(CNTKCustomMKLVersion))</MathLibraryName>
+    <MathLibraryPath>$(CNTKCustomMKLPath)\x64\parallel</MathLibraryPath>
+    <MathLinkLibrary>mkl_cntk_p.lib</MathLinkLibrary>
+    <MathDelayLoad>mkl_cntk_p.dll</MathDelayLoad>
+    <MathPostBuildCopyPattern>$(MathLibraryPath)\*.dll</MathPostBuildCopyPattern>
+    <UnitTestDlls>$(OutDir)mkl_cntk_p.dll;$(OutDir)libiomp5md.dll;</UnitTestDlls>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(CNTK_MKL_SEQUENTIAL)' == '1'">
+    <MathLibraryName>CNTK custom MKL Sequential (Version: $(CNTKCustomMKLVersion))</MathLibraryName>
+    <MathLibraryPath>$(CNTKCustomMKLPath)\x64\sequential</MathLibraryPath>
+    <MathLinkLibrary>mkl_cntk_s.lib</MathLinkLibrary>
+    <MathDelayLoad>mkl_cntk_s.dll</MathDelayLoad>
+    <MathPostBuildCopyPattern>$(MathLibraryPath)\*.dll</MathPostBuildCopyPattern>
+    <UnitTestDlls>$(OutDir)mkl_cntk_s.dll;</UnitTestDlls>
+  </PropertyGroup>

  <PropertyGroup Condition="$(UseZip)">
    <ZipInclude>$(ZLIB_PATH)\include;$(ZLIB_PATH)\lib\libzip\include;</ZipInclude>
--- a/CNTK.sln
+++ b/CNTK.sln
@ -1150,6 +1150,9 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalClient", "Examples\E
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BrainScriptTests", "Tests\UnitTests\BrainScriptTests\BrainScriptTests.vcxproj", "{9F999212-AFC5-4EAC-AA78-F7247D46C456}"
 	ProjectSection(ProjectDependencies) = postProject
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
+		{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513}
+		{EAD17188-072C-4726-B840-A769C36DAD1B} = {EAD17188-072C-4726-B840-A769C36DAD1B}
 		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
 	EndProjectSection
 EndProject
--- a/31
+++ b/31
@ -9,8 +9,6 @@
 # that provides
 #   BUILDTYPE= One of release or debug
 #     defaults to release
-#   ACML_PATH= path to ACML library installation
-#     only needed if MATHLIB=acml
 #   MKL_PATH= path to CNTK custom MKL installation
 #     only needed if MATHLIB=mkl
 #   CNTK_CUSTOM_MKL_VERSION=2
@ -21,8 +19,8 @@
 #     defaults to /usr/include/nvidia/gdk
 #   GDK_NVML_LIB_PATH= path to CUDA GDK (stub) library path, so $(GDK_NVML_LIB_PATH)/libnvidia-ml.so exists
 #     defaults to /usr/src/gdk/nvml/lib
-#   MATHLIB= One of acml or mkl
-#     defaults to acml
+#   MATHLIB= math library to use (currently only mkl)
+#     defaults to mkl
 #   CUDA_PATH= Path to CUDA
 #     If not specified, GPU will not be enabled
 #   CUB_PATH= path to NVIDIA CUB installation, so $(CUB_PATH)/cub/cub.cuh exists
@ -60,8 +58,8 @@ BUILDTYPE=release
 endif

 ifndef MATHLIB
-$(info DEFAULTING MATHLIB=acml)
-MATHLIB = acml
+$(info DEFAULTING MATHLIB=mkl)
+MATHLIB = mkl
 endif

 #### Configure based on options above
@ -137,13 +135,6 @@ else
  COMMON_FLAGS +=-DCPUONLY
 endif

-ifeq ("$(MATHLIB)","acml")
-  INCLUDEPATH += $(ACML_PATH)/include
-  LIBPATH += $(ACML_PATH)/lib
-  LIBS += -lacml_mp -liomp5 -lm -lpthread
-  COMMON_FLAGS += -DUSE_ACML
-endif
-
 ifeq ("$(MATHLIB)","mkl")
  INCLUDEPATH += $(MKL_PATH)/$(CNTK_CUSTOM_MKL_VERSION)/include
  LIBS += -lm
@ -418,6 +409,10 @@ CNTKLIBRARY_TESTS_SRC =\
 	Tests/UnitTests/V2LibraryTests/TensorTests.cpp \
 	Tests/UnitTests/V2LibraryTests/TrainerTests.cpp \
 	Tests/UnitTests/V2LibraryTests/CifarResNet.cpp \
+	Tests/UnitTests/V2LibraryTests/SerializationTests.cpp \
+	Tests/UnitTests/V2LibraryTests/LearnerTests.cpp \
+	Tests/UnitTests/V2LibraryTests/FunctionTests.cpp \
+	Tests/UnitTests/V2LibraryTests/SequenceClassification.cpp \

 CNTKLIBRARY_TESTS:=$(BINDIR)/v2librarytests
 CNTKLIBRARY_TESTS_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(CNTKLIBRARY_TESTS_SRC)))
@ -933,22 +928,24 @@ UNITTEST_BRAINSCRIPT_SRC = \
 	$(SOURCEDIR)/CNTK/BrainScript/BrainScriptEvaluator.cpp \
 	$(SOURCEDIR)/CNTK/BrainScript/BrainScriptParser.cpp \
 	$(SOURCEDIR)/../Tests/UnitTests/BrainScriptTests/ParserTests.cpp \
+	$(SOURCEDIR)/../Tests/UnitTests/BrainScriptTests/ComputationNetworkTests.cpp \
 	$(SOURCEDIR)/../Tests/UnitTests/BrainScriptTests/stdafx.cpp

-UNITTEST_BRAINSCRIPT_SRC+=$(COMMON_SRC)
+UNITTEST_BRAINSCRIPT_SRC += $(COMPUTATION_NETWORK_LIB_SRC)
+UNITTEST_BRAINSCRIPT_SRC += $(SEQUENCE_TRAINING_LIB_SRC)

-UNITTEST_BRAINSCRIPT_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_BRAINSCRIPT_SRC))
+UNITTEST_BRAINSCRIPT_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_BRAINSCRIPT_SRC)))

 UNITTEST_BRAINSCRIPT := $(BINDIR)/brainscripttests

 ALL += $(UNITTEST_BRAINSCRIPT)
 SRC += $(UNITTEST_BRAINSCRIPT_SRC)

-$(UNITTEST_BRAINSCRIPT): $(UNITTEST_BRAINSCRIPT_OBJ)
+$(UNITTEST_BRAINSCRIPT): $(UNITTEST_BRAINSCRIPT_OBJ) | $(CNTKMATH_LIB)
 	@echo $(SEPARATOR)
 	@mkdir -p $(dir $@)
 	@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
-	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -ldl
+	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -ldl -l$(CNTKMATH)

 unittests: $(UNITTEST_EVAL) $(UNITTEST_READER) $(UNITTEST_NETWORK) $(UNITTEST_MATH) $(UNITTEST_BRAINSCRIPT)

--- a/Source/ActionsLib/TrainActions.cpp
+++ b/Source/ActionsLib/TrainActions.cpp
@ -72,18 +72,6 @@ void DoTrain(const ConfigRecordType& config)
    bool makeMode = config(L"makeMode", true);
    DEVICEID_TYPE deviceId = DeviceFromConfig(config);

-    // determine the network-creation function
-    // We have several ways to create that network.
-    function<ComputationNetworkPtr(DEVICEID_TYPE)> createNetworkFn;
-
-    createNetworkFn = GetNetworkFactory<ConfigRecordType, ElemType>(config);
-
-    auto dataReader = CreateObject<DataReader>(config, L"reader");
-
-    shared_ptr<DataReader> cvDataReader;
-    if (config.Exists(L"cvReader"))
-        cvDataReader = CreateObject<DataReader>(config, L"cvReader");
-
    shared_ptr<SGD<ElemType>> optimizer;
    if (config.Exists(L"optimizer"))
    {
@ -95,8 +83,39 @@ void DoTrain(const ConfigRecordType& config)
        optimizer = make_shared<SGD<ElemType>>(configSGD);
    }

+    // determine which epoch to start with, including recovering a checkpoint if any and 'makeMode' enabled
+    int startEpoch = optimizer->DetermineStartEpoch(makeMode);
+    if (startEpoch == optimizer->GetMaxEpochs())
+    {
+        LOGPRINTF(stderr, "No further training is necessary.\n");
+        return;
+    }
+
+    wstring modelFileName = optimizer->GetModelNameForEpoch(int(startEpoch) - 1);
+    bool loadNetworkFromCheckpoint = startEpoch >= 0;
+    fprintf(stderr, "\n");
+    if (loadNetworkFromCheckpoint)
+        LOGPRINTF(stderr, "Starting from checkpoint. Loading network from '%ls'.\n", modelFileName.c_str());
+    else
+        LOGPRINTF(stderr, "Creating virgin network.\n");
+
+    // determine the network-creation function
+    // We have several ways to create that network.
+    function<ComputationNetworkPtr(DEVICEID_TYPE)> createNetworkFn;
+
+    createNetworkFn = GetNetworkFactory<ConfigRecordType, ElemType>(config);
+
+    // create or load from checkpoint
+    shared_ptr<ComputationNetwork> net = !loadNetworkFromCheckpoint ? createNetworkFn(deviceId) : ComputationNetwork::CreateFromFile<ElemType>(deviceId, modelFileName);
+
+    auto dataReader = CreateObject<DataReader>(config, L"reader");
+
+    shared_ptr<DataReader> cvDataReader;
+    if (config.Exists(L"cvReader"))
+        cvDataReader = CreateObject<DataReader>(config, L"cvReader");
+
    optimizer->InitMPI(MPIWrapper::GetInstance());
-    optimizer->Train(createNetworkFn, deviceId, dataReader.get(), cvDataReader.get(), makeMode);
+    optimizer->Train(net, deviceId, dataReader.get(), cvDataReader.get(), startEpoch, loadNetworkFromCheckpoint);
 }

 namespace Microsoft { namespace MSR { namespace ScriptableObjects {
@ -189,9 +208,8 @@ void DoDumpNodes(const ConfigParameters& config)
    if (!printValues && !printMetadata)
        InvalidArgument("printValues and printMetadata: Since both are set to false, there will be nothing to dump");

-    ComputationNetwork net(CPUDEVICE);    // always use CPU
-    net.Load<ElemType>(modelPath); // TODO: we have a function now to combine this and the previous line
-    net.DumpNodeInfoToFile(nodeName, printValues, printMetadata, outputFile, nodeNameRegexStr);
+    ComputationNetworkPtr net = ComputationNetwork::CreateFromFile<ElemType>(CPUDEVICE, modelPath);
+    net->DumpNodeInfoToFile(nodeName, printValues, printMetadata, outputFile, nodeNameRegexStr);
 }

 template void DoDumpNodes<float>(const ConfigParameters& config);
--- a/Source/CNTK/CNTK.cpp
+++ b/Source/CNTK/CNTK.cpp
@ -10,7 +10,7 @@
 #include "stdafx.h"
 #ifdef _WIN32
 #include <crtdbg.h>
-#endif
+#endif 

 #include "Basics.h"
 #include "Actions.h"
--- a/Source/CNTK/prebuild.bat
+++ b/Source/CNTK/prebuild.bat
@ -53,8 +53,6 @@ if "%p_CNTK_MKL%" == "1" (
  ) else (
    echo #define _MATHLIB_ "mkl">> buildinfo.h$$
  )
-) else (
-  echo #define _MATHLIB_ "acml">> buildinfo.h$$
 )

 echo #define _BUILDER_ "%USERNAME%"     >> buildinfo.h$$
--- a/Source/CNTKv2LibraryDll/API/CNTKLibrary.h
+++ b/Source/CNTKv2LibraryDll/API/CNTKLibrary.h
@ -7,6 +7,12 @@

 #pragma once

+#ifdef SWIG // When SWIG is defined, the keywords/macro below are defined away (they expand to nothing).
+#define final
+#define explicit
+#define static_assert(condition, message)
+#endif // SWIG -- NOTE(review): presumably needed because the SWIG parser cannot handle these; confirm
+
 #include "CNTKLibraryInternals.h"

 #include <memory>
@ -14,10 +20,12 @@
 #include <array>
 #include <stdarg.h>
 #include <assert.h>
+#include <map>
 #include <unordered_map>
 #include <unordered_set>
 #include <string>
 #include <sstream>
+#include <iosfwd>
 #include<algorithm>

 namespace CNTK
@ -236,7 +244,7 @@ namespace CNTK
        }

        ///
-        /// Creates and returns a new shape contructed by appending the dimensions of the specified 'shape' to 'this' shape's dimensions.
+        /// Creates and returns a new shape constructed by appending the dimensions of the specified 'shape' to 'this' shape's dimensions.
        ///
        NDShape AppendShape(const NDShape& shape) const
        {
@ -665,35 +673,52 @@ namespace CNTK

    ///
    /// Denotes an Axis of a Variable and is used for specifying the axes parameters of certain Functions such as reductions.
-    /// Besides the static axes corresponding to each of the axes of the Variable's shape, Input and Output Variables
-    /// also have one or more dynamic axes (corresponding to the sequence dimensions) and one implicit batch axis denoting the axes 
-    /// along which multiple sequences are batched in the Values corresponding to the variable when performing computations.
+    /// Besides the static axes corresponding to each of the axes of the Variable's shape, Variables of kind 'Input' and any 
+    /// 'Output' Variables dependent on an 'Input' Variable also have 2 additional dynamic axes whose dimensions are known only 
+    /// when the Variable is bound to actual data during compute (viz. sequence axis and batch axis denoting the axis along which
+    /// multiple sequences are batched)
    ///
    class Axis final
    {
+        CNTK_API static const std::wstring s_staticAxisNamePrefix;
    public:
        ///
        /// Construct an Axis object denoting a static axis with the specified index.
        ///
-        Axis(size_t staticAxisIdx)
+        explicit Axis(size_t staticAxisIdx)
            : m_staticAxisIdx(staticAxisIdx)
        {
-            const wchar_t* staticAxisNamePrefix = L"staticAxis_";
-            m_name = staticAxisNamePrefix + std::to_wstring(staticAxisIdx);
+            m_name = s_staticAxisNamePrefix + std::to_wstring(staticAxisIdx);
        }

        ///
        /// Construct a dynamic axis with the specified name.
        ///
-        Axis(const std::wstring& name)
+        explicit Axis(const std::wstring& name)
            : m_staticAxisIdx(SIZE_MAX), m_name(name)
        {
+            // A name of the form "<s_staticAxisNamePrefix><decimal digits>" denotes a static axis; any other name stays dynamic.
+            // The suffix is parsed by hand: std::stoul throws on non-numeric or too-large input and accepts signs/trailing text.
+            const size_t prefixLength = s_staticAxisNamePrefix.length();
+            if ((m_name.length() > prefixLength) && (m_name.compare(0, prefixLength, s_staticAxisNamePrefix) == 0) &&
+                (m_name.find_first_not_of(L"0123456789", prefixLength) == std::wstring::npos))
+            {
+                size_t staticAxisIdx = 0;
+                bool overflow = false;
+                for (size_t i = prefixLength; (i < m_name.length()) && !overflow; ++i)
+                {
+                    const size_t digit = static_cast<size_t>(m_name[i] - L'0');
+                    overflow = (staticAxisIdx > ((SIZE_MAX - digit) / 10));
+                    if (!overflow) staticAxisIdx = (staticAxisIdx * 10) + digit;
+                }
+                if (!overflow && (staticAxisIdx != SIZE_MAX)) *this = Axis(staticAxisIdx); // SIZE_MAX is the 'dynamic axis' marker
+            }
        }

        ///
        /// Returns a boolean indicating if 'this' Axis corresponds to a static axis
        ///
-        bool IsStaticAxis() const { return m_staticAxisIdx == SIZE_MAX; }
+        bool IsStaticAxis() const { return m_staticAxisIdx != SIZE_MAX; }

        ///
        /// Returns the axis index if 'this' Axis is a static axis. Throws an exception otherwise.
@ -714,12 +739,7 @@ namespace CNTK
        ///
        /// Static Axis object representing the batch axis.
        ///
-        CNTK_API static const Axis& BatchAxis();
-
-        ///
-        /// Special Axis object denoting all the axes of the Value object in whose context it is used.
-        ///
-        CNTK_API static const Axis& AllAxes();
+        CNTK_API static const Axis& DefaultBatchAxis();

        ///
        /// Name of 'this' axis
@ -753,7 +773,20 @@ namespace CNTK
    {
        return !(first == second);
    }
+}

+namespace std {
+    template <> struct hash<CNTK::Axis> // lets CNTK::Axis be used as a key in unordered containers
+    {
+        size_t operator()(const CNTK::Axis& x) const
+        {
+            return std::hash<std::wstring>()(x.Name()); // the hash of an Axis is the hash of its name
+        }
+    };
+}
+
+namespace CNTK
+{
    ///
    /// Enumeration type denoting the kind of a symbolic Variable object
    ///
@ -780,47 +813,76 @@ namespace CNTK
        template <typename T>
        friend struct std::hash;

+        CNTK_API static const std::vector<Axis> s_defaultInputVariableDynamicAxes;
    public:
        ///
        /// Create an 'Input' Variable.
        ///
-        Variable(const NDShape& shape, CNTK::DataType dataType)
-            : Variable(shape, dataType, L"")
+        Variable(const NDShape& shape, CNTK::DataType dataType, const std::vector<Axis>& dynamicAxes = s_defaultInputVariableDynamicAxes)
+            : Variable(shape, dataType, L"", dynamicAxes)
        {}

        ///
        /// Create an 'Input' Variable.
        ///
-        Variable(const NDShape& shape, CNTK::DataType dataType, const wchar_t* name)
-            : Variable(shape, dataType, std::wstring(name))
+        Variable(const NDShape& shape, CNTK::DataType dataType, const wchar_t* name, const std::vector<Axis>& dynamicAxes = s_defaultInputVariableDynamicAxes)
+            : Variable(shape, dataType, std::wstring(name), dynamicAxes)
        {}

        ///
        /// Create an 'Input' Variable.
        ///
-        Variable(const NDShape& shape, CNTK::DataType dataType, const std::wstring& name)
-            : Variable(shape, VariableKind::Input, dataType, nullptr, nullptr, false, { Axis::DefaultDynamicAxis() }, false, name)
+        Variable(const NDShape& shape, CNTK::DataType dataType, const std::wstring& name, const std::vector<Axis>& dynamicAxes = s_defaultInputVariableDynamicAxes)
+            : Variable(shape, false, dataType, name, dynamicAxes)
        {}

        ///
        /// Create an 'Input' Variable denoting sparse data.
        ///
-        Variable(const NDShape& shape, bool isSparse, CNTK::DataType dataType, const std::wstring& name = L"")
-            : Variable(shape, VariableKind::Input, dataType, nullptr, nullptr, false, { Axis::DefaultDynamicAxis() }, isSparse, name)
+        Variable(const NDShape& shape, bool isSparse, CNTK::DataType dataType, const std::vector<Axis>& dynamicAxes = s_defaultInputVariableDynamicAxes)
+            : Variable(shape, isSparse, dataType, false, L"", dynamicAxes)
+        {}
+
+        ///
+        /// Create an 'Input' Variable denoting sparse data.
+        ///
+        Variable(const NDShape& shape, bool isSparse, CNTK::DataType dataType, const wchar_t* name, const std::vector<Axis>& dynamicAxes = s_defaultInputVariableDynamicAxes)
+            : Variable(shape, isSparse, dataType, std::wstring(name), dynamicAxes)
+        {}
+
+        ///
+        /// Create an 'Input' Variable denoting sparse data.
+        ///
+        Variable(const NDShape& shape, bool isSparse, CNTK::DataType dataType, const std::wstring& name, const std::vector<Axis>& dynamicAxes = s_defaultInputVariableDynamicAxes)
+            : Variable(shape, isSparse, dataType, false, name, dynamicAxes)
        {}

        ///
        /// Create an 'Input' Variable and specify if gradients are to be computed for this input
        ///
-        Variable(const NDShape& shape, CNTK::DataType dataType, bool needsGradient, const std::wstring& name = L"")
-            : Variable(shape, VariableKind::Input, dataType, nullptr, nullptr, needsGradient, { Axis::DefaultDynamicAxis() }, false, name)
+        Variable(const NDShape& shape, CNTK::DataType dataType, bool needsGradient, const wchar_t* name, const std::vector<Axis>& dynamicAxes = s_defaultInputVariableDynamicAxes)
+            : Variable(shape, dataType, needsGradient, std::wstring(name), dynamicAxes)
+        {}
+
+        ///
+        /// Create an 'Input' Variable and specify if gradients are to be computed for this input
+        ///
+        Variable(const NDShape& shape, CNTK::DataType dataType, bool needsGradient, const std::wstring& name, const std::vector<Axis>& dynamicAxes = s_defaultInputVariableDynamicAxes)
+            : Variable(shape, false, dataType, needsGradient, name, dynamicAxes)
        {}

        ///
        /// Create an 'Input' Variable denoting sparse data and specify if gradients are to be computed for this input
        ///
-        Variable(const NDShape& shape, bool isSparse, CNTK::DataType dataType, bool needsGradient, const std::wstring& name = L"")
-            : Variable(shape, VariableKind::Input, dataType, nullptr, nullptr, needsGradient, { Axis::DefaultDynamicAxis() }, isSparse, name)
+        Variable(const NDShape& shape, bool isSparse, CNTK::DataType dataType, bool needsGradient, const std::vector<Axis>& dynamicAxes = s_defaultInputVariableDynamicAxes)
+            : Variable(shape, isSparse, dataType, needsGradient, L"", dynamicAxes)
+        {}
+
+        ///
+        /// Create an 'Input' Variable denoting sparse data and specify if gradients are to be computed for this input
+        ///
+        Variable(const NDShape& shape, bool isSparse, CNTK::DataType dataType, bool needsGradient, const std::wstring& name, const std::vector<Axis>& dynamicAxes = s_defaultInputVariableDynamicAxes)
+            : Variable(shape, VariableKind::Input, dataType, nullptr, nullptr, needsGradient, dynamicAxes, isSparse, name)
        {}

        ///
@ -860,7 +922,7 @@ namespace CNTK
        ///
        /// Returns a boolean value indicating if 'this' variable denotes sparse data
        ///
-        bool IsSparse() const { return (m_dataFields->m_isSparse); }
+        bool IsSparse() const { return m_dataFields->m_isSparse; }

        ///
        /// Returns a boolean value indicating if 'this' variable is an Input
@ -941,6 +1003,14 @@ namespace CNTK
            VariableFields(const NDShape& shape, VariableKind varType, CNTK::DataType type, Function* ownerFunction, const NDArrayViewPtr& value, bool needsGradient, const std::vector<Axis>& dynamicAxes, bool isSparse, const std::wstring& name)
                : m_shape(shape), m_varKind(varType), m_dataType(type), m_ownerFunction(ownerFunction), m_value(value), m_needsGradient(needsGradient), m_dynamicAxes(dynamicAxes), m_isSparse(isSparse), m_name(name)
            {
+                // Validate that the dynamic axes are unique, i.e. that no axis is specified more than once
+                std::unordered_set<Axis> uniqueDynamicAxis;
+                for (auto& currentDynamicAxis : dynamicAxes)
+                {
+                    auto retVal = uniqueDynamicAxis.insert(currentDynamicAxis);
+                    if (!retVal.second) // insert() reports 'already present' through .second == false
+                        InvalidArgument("Dynamic axis named %S is specified more than once for Variable object", currentDynamicAxis.Name().c_str());
+                }
            }

        private:
@ -1079,7 +1149,7 @@ namespace CNTK
        /// Contruct a Placeholder with the specified NDShape
        ///
        explicit Placeholder(const NDShape& shape, const std::wstring& name = L"")
-            : Variable(shape, VariableKind::Placeholder, DataType::Unknown, nullptr, false, {Axis::DefaultDynamicAxis()}, name)
+            : Variable(shape, VariableKind::Placeholder, DataType::Unknown, nullptr, false, { Axis::DefaultDynamicAxis(), Axis::DefaultBatchAxis() }, name)
        {}

        ///
@ -1097,13 +1167,15 @@ namespace CNTK
 }

 namespace std {
-    template <> struct hash<CNTK::Axis>
+    
+    template <> struct hash<CNTK::NDShape>
    {
-        size_t operator()(const CNTK::Axis& x) const
+        size_t operator()(const CNTK::NDShape& x) const
        {
-            return std::hash<std::wstring>()(x.Name());
+            return std::hash<std::wstring>()(x.AsString());
        }
    };
+
    
    template <> struct hash<CNTK::Variable>
    {
@ -1441,6 +1513,21 @@ namespace CNTK
    ///
    CNTK_API FunctionPtr Softmax(const Variable& operand, const std::wstring& name = L"");

+    ///
+    /// Create an instance of the CNTK built-in hardmax operation on specified tensor input operand
+    ///
+    CNTK_API FunctionPtr Hardmax(const Variable& operand, const std::wstring& name = L"");
+
+    ///
+    /// Create an instance of the CNTK built-in transpose dimensions operation on specified tensor input operand
+    ///
+    CNTK_API FunctionPtr TransposeAxes(const Variable& operand, const Axis& axis1, const Axis& axis2, const std::wstring& name = L"");
+
+    ///
+    /// Create an instance of the slice operation on specified tensor input operand
+    ///
+    CNTK_API FunctionPtr Slice(const Variable& operand, const Axis& axis, int beginIndex, int endIndex, const std::wstring& name = L"");
+
    ///
    /// Create an instance of the CNTK built-in elementwise tensor addition operation with the specified input operands.
    ///
@ -1497,6 +1584,13 @@ namespace CNTK
    ///
    CNTK_API FunctionPtr Times(const Variable& leftOperand, const Variable& rightOperand, size_t numOutputAxes = 1, const std::wstring& name = L"");

+    ///
+    /// Create an instance of the CNTK built-in matrix multiplication operation with the transpose of the left input operand
+    /// and the specified right operand. Only accepts left operands of ranks 1 or 2.
+    /// TODO: Specify the constraints on the shapes of the operands.
+    ///
+    CNTK_API FunctionPtr TransposeTimes(const Variable& leftOperand, const Variable& rightOperand, size_t numOutputAxes = 1, const std::wstring& name = L"");
+
    ///
    /// Create an instance of the CNTK built-in operation to compute squared-error for specified input operands.
    ///
@ -1518,7 +1612,6 @@ namespace CNTK
    ///
    CNTK_API FunctionPtr PastValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name = L"");

-    //CNTK_API FunctionPtr PastValue(const Variable& initialState, const Variable& operand, Axis axis, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in operation for getting the future value along the lone dynamic axis of the specified operand.
@ -1532,6 +1625,16 @@ namespace CNTK
    ///
    CNTK_API FunctionPtr ReduceSum(const Variable& operand, const std::wstring& name = L"");

+    ///
+    /// Create an instance of the CNTK built-in sum reduction operation on specified tensor input operand along the specified axis
+    ///
+    CNTK_API FunctionPtr ReduceSum(const Variable& operand, const Axis& axis, const std::wstring& name = L"");
+
+    ///
+    /// Create an instance of the CNTK built-in LogSum reduction operation on specified tensor input operand along the specified axis
+    ///
+    CNTK_API FunctionPtr ReduceLogSum(const Variable& operand, const Axis& axis, const std::wstring& name = L"");
+
    ///
    /// Per dimension mean-variance normalization of the specified input operand.
    ///
@ -1630,6 +1733,7 @@ namespace CNTK
            NDShape,
            Vector,
            Dictionary,
+            NDArrayView,
        };

        static const char* TypeName(Type type)
@ -1654,6 +1758,8 @@ namespace CNTK
                return "Vector";
            case Type::Dictionary:
                return "Dictionary";
+            case Type::NDArrayView:
+                return "NDArrayView";
            default:
                LogicError("Unknown DictionaryValue::Type");
            }
@ -1687,13 +1793,21 @@ namespace CNTK
        DictionaryValue(const wchar_t* value) 
            : DictionaryValue(std::wstring(value))
        {}
+
+        // Due to SWIG we had to flatten this template for vector<DictionaryValue>
+        DictionaryValue(const std::vector<CNTK::DictionaryValue>& value) : m_valueType(GetValueType<std::vector<CNTK::DictionaryValue>>())
+        {
+            AllocateDataPtr(value);
+        }
+
        template <typename T>
        DictionaryValue(const T& value) : m_valueType(GetValueType<T>())
        {
-            static_assert(std::is_same<T, NDShape>::value ||
-                          std::is_same<T, std::wstring>::value ||
-                          std::is_same<T, std::vector<DictionaryValue>>::value ||
-                          std::is_same<T, Dictionary>::value,
+            static_assert((std::is_same<T, NDShape>::value ||
+                std::is_same<T, std::wstring>::value ||
+                std::is_same<T, std::vector<DictionaryValue>>::value ||
+                std::is_same<T, Dictionary>::value ||
+                std::is_same<T, NDArrayView>::value),
                          "Unsupported ValueType");

            AllocateDataPtr(value);
@ -1706,6 +1820,12 @@ namespace CNTK
            *this = other;
        }

+        DictionaryValue(DictionaryValue&& other) : m_valueType(Type::Bool)
+        {
+            // The m_valueType must have been set to a non-ptr type to prevent an attempt to interpret
+            // the underlying uninitialized value as a ptr and free it.
+            *this = std::move(other);
+        }
        DictionaryValue& operator=(const DictionaryValue& other)
        {
            if (this != &other)
@ -1723,11 +1843,33 @@ namespace CNTK
                    AllocateDataPtr(other.GetValue<std::vector<DictionaryValue>>());
                else if (other.m_valueType == Type::Dictionary)
                    AllocateDataPtr(other.GetValue<Dictionary>());
+                else if (other.m_valueType == Type::NDArrayView)
+                    AllocateDataPtr(other.GetValue<NDArrayView>());
            }

            return *this;
        }

+        // Move assignment: frees the current payload, then takes over 'other's type tag and data;
+        // 'other' is left as Type::None. Self-assignment is a no-op, as in the copy assignment.
+        DictionaryValue& operator=(DictionaryValue&& other)
+        {
+            if (this == &other)
+                return *this;
+            FreeDataPtr();
+            m_valueType = other.m_valueType;
+            m_data = other.m_data;
+
+            // For pointer-backed types 'this' now holds the pointer; clear it in 'other'.
+            if (other.m_valueType == Type::String ||
+                other.m_valueType == Type::NDShape ||
+                other.m_valueType == Type::Vector ||
+                other.m_valueType == Type::Dictionary ||
+                other.m_valueType == Type::NDArrayView)
+                other.m_data.m_ptr = nullptr;
+            other.m_valueType = Type::None;
+            return *this;
+        }
        ~DictionaryValue()
        {
            FreeDataPtr();
@ -1764,7 +1906,8 @@ namespace CNTK
        template <typename T, typename std::enable_if<std::is_same<T, NDShape>::value ||
            std::is_same<T, std::wstring>::value ||
            std::is_same<T, std::vector<DictionaryValue>>::value ||
-            std::is_same<T, Dictionary>::value>::type* = nullptr>
+            std::is_same<T, Dictionary>::value ||
+            std::is_same<T, NDArrayView>::value>::type* = nullptr>
        const T& GetValue() const
        {
            VerifyType<T>();
@ -1781,21 +1924,25 @@ namespace CNTK
            return m_valueType;
        }

-        friend CNTK_API Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, DictionaryValue& us);
-        friend CNTK_API Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const DictionaryValue& us);
+        CNTK_API bool operator==(const DictionaryValue& other) const;
+        CNTK_API bool operator!=(const DictionaryValue& other) const;
+
+        friend CNTK_API std::istream& operator>>(std::istream& stream, DictionaryValue& us);
+        friend CNTK_API std::ostream& operator<<(std::ostream& stream, const DictionaryValue& us);

    private:
        template <typename T>
        static Type GetValueType()
        {
-            static_assert(std::is_same<T, bool>::value ||
+            static_assert((std::is_same<T, bool>::value ||
                          std::is_same<T, size_t>::value ||
                          std::is_same<T, float>::value ||
                          std::is_same<T, double>::value ||
                          std::is_same<T, std::wstring>::value ||
                          std::is_same<T, NDShape>::value ||
-                          std::is_same<T, std::vector<DictionaryValue>>::value ||
-                          std::is_same<T, Dictionary>::value,
+                std::is_same<T, std::vector<DictionaryValue>>::value ||
+                          std::is_same<T, Dictionary>::value ||
+                          std::is_same<T, NDArrayView>::value),
                          "Unsupported ValueType");

            if (std::is_same<T, bool>::value)                                      return Type::Bool;
@ -1806,6 +1953,7 @@ namespace CNTK
            if (std::is_same<T, NDShape>::value)                                   return Type::NDShape;
            if (std::is_same<T, std::vector<DictionaryValue>>::value)              return Type::Vector;
            if (std::is_same<T, Dictionary>::value)                                return Type::Dictionary;
+            if (std::is_same<T, NDArrayView>::value)                               return Type::NDArrayView;
        }

        template <typename T>
@ -1831,6 +1979,8 @@ namespace CNTK
                FreePtrAsType<std::vector<DictionaryValue>>();
            else if (m_valueType == Type::Dictionary)
                FreePtrAsType<Dictionary>();
+            else if (m_valueType == Type::NDArrayView)
+                FreePtrAsType<NDArrayView>();
        }

        Type m_valueType;
@ -1884,9 +2034,11 @@ namespace CNTK
            return Contains(key.c_str());
        }

+        CNTK_API bool operator==(const Dictionary& other) const;
+        CNTK_API bool operator!=(const Dictionary& other) const;

-        friend CNTK_API Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, Dictionary& us);
-        friend CNTK_API Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const Dictionary& us);
+        friend CNTK_API std::istream& operator>>(std::istream& stream, Dictionary& us);
+        friend CNTK_API std::ostream& operator<<(std::ostream& stream, const Dictionary& us);

    private:
        std::shared_ptr<std::unordered_map<std::wstring, DictionaryValue>> m_dictionaryData;
@ -1924,6 +2076,9 @@ namespace CNTK
        ///
        CNTK_API virtual void RestoreFromCheckpoint(const Dictionary& /*checkpoint*/) {}

+        ///
+        /// Destruct this Learner.
+        ///
        virtual ~Learner() {}

    protected:
@ -1935,37 +2090,127 @@ namespace CNTK

    };

+    ///
+    /// A collection of key-value pairs that represents a training parameter schedule in
+    /// terms of the number of processed samples.
+    /// This class provides a number of convenience constructors to allow easy conversion
+    /// from a single value, a vector of values and a list of pairs to the training schedule.
+    ///
+    template <typename T>
+    class TrainingParameterSchedule
+    {
+    public:
+        ///
+        /// Create a schedule with a constant parameter value.
+        ///
+        TrainingParameterSchedule(T value)
+            : m_schedule({ std::make_pair(0, value) }), m_unit(1)
+        {}
+
+        ///
+        /// Create a schedule where the parameter changes its value every 'unit' samples:
+        /// schedule[0] is used for the first 'unit' samples, schedule[1] -- for the second,
+        /// and so on. The last value is then used repeatedly until the end of training.
+        /// Raises a RuntimeError if 'unit' is 0 or if 'schedule' is empty.
+        ///
+        TrainingParameterSchedule(const std::vector<T>& schedule, size_t unit = 1)
+            : m_unit(unit)
+        {
+            // TODO: 0 will be used to mean "the entire sweep"
+            if (unit == 0)
+                RuntimeError("TrainingParameterSchedule::constructor : 'unit' cannot be 0.");
+
+            if (schedule.size() == 0)
+                RuntimeError("TrainingParameterSchedule::constructor : schedule is empty.");
+
+            // The i-th value (1-based) is stored under the sample count at which its interval ends.
+            size_t i = 1;
+            for (const auto& value : schedule)
+            {
+                m_schedule[m_unit * i++] = value;
+            }
+        }
+
+        ///
+        /// Create a schedule using the list of key-value pairs, where the key specifies
+        /// the number of 'units' the parameter should maintain the corresponding value.
+        /// The value from the last pair is used repeatedly until the end of training.
+        /// For example, {{1, 0.05}, {2, 0.1}, {1, 0.005}} and unit = 100, corresponds to
+        /// a schedule where the value of '0.05' is used for the first 100 samples, then
+        /// '0.1' is used for the next 200 samples, after which the value is switched
+        /// to '0.005'.
+        /// Raises a RuntimeError if 'unit' is 0, if 'schedule' is empty or if any unit count is 0.
+        ///
+        TrainingParameterSchedule(const std::initializer_list<std::pair<const size_t, T>>& schedule, size_t unit = 1)
+            : m_unit(unit)
+        {
+            // TODO: 0 will be used to mean "the entire sweep"
+            if (unit == 0)
+                RuntimeError("TrainingParameterSchedule::constructor : 'unit' cannot be 0.");
+
+            if (schedule.size() == 0)
+                RuntimeError("TrainingParameterSchedule::constructor : schedule is empty.");
+
+            // 'i' accumulates the unit counts, so each value is stored under the sample count
+            // at which its interval ends.
+            size_t i = 0;
+            for (const auto& it : schedule)
+            {
+                if (it.first == 0)
+                    RuntimeError("TrainingParameterSchedule::constructor : unit count cannot be 0.");
+
+                i += it.first;
+                m_schedule[m_unit * i] = it.second;
+            }
+        }
+
+        ///
+        /// Returns a value corresponding to the absolute sample count from the beginning of training.
+        ///
+        CNTK_API const T& operator[](size_t sampleCount) const;
+
+    private:
+        // Maps the sample count at which a value's interval ends to that value
+        // (the constant-value constructor stores its single value under key 0).
+        std::map<size_t, T> m_schedule;
+        // Number of samples that make up one schedule 'unit'.
+        size_t m_unit;
+    };
+
+    typedef TrainingParameterSchedule<double> LearningRatesPerSample;
+    typedef TrainingParameterSchedule<double> MomentumsPerSample;
+
    ///
    /// Create an instance of the CNTK built-in SGD learner.
    ///
-    /// TODO: add additional SGD parameters here (a collection of learning rate values)
-    CNTK_API LearnerPtr SGDLearner(const std::unordered_set<Parameter>& parameters, double learningRatePerSample);
+    CNTK_API LearnerPtr SGDLearner(const std::unordered_set<Parameter>& parameters, 
+                                   const LearningRatesPerSample& learningRates);

    ///
    /// Create an instance of the CNTK built-in Momentum SGD learner.
    ///
-    /// TODO: add additional Momentum parameters here (a collection of momentum rate values)
-    CNTK_API LearnerPtr MomentumSGDLearner(const std::unordered_set<Parameter>& parameters);
+    CNTK_API LearnerPtr MomentumSGDLearner(const std::unordered_set<Parameter>& parameters, 
+                                           const LearningRatesPerSample& learningRates,
+                                           const MomentumsPerSample& momentums);

    ///
    /// Create an instance of the CNTK built-in Nesterov's accelerated SGD learner.
    ///
-    CNTK_API LearnerPtr NesterovLearner(const std::unordered_set<Parameter>& parameters);
+    CNTK_API LearnerPtr NesterovLearner(const std::unordered_set<Parameter>& parameters, 
+                                        const LearningRatesPerSample& learningRates,
+                                        const MomentumsPerSample& momentums);

    ///
    /// Create an instance of the CNTK built-in AdaGrad learner.
    ///
-    CNTK_API LearnerPtr AdaGradLearner(const std::unordered_set<Parameter>& parameters, bool needAveMultiplier = true);
+    CNTK_API LearnerPtr AdaGradLearner(const std::unordered_set<Parameter>& parameters,
+                                       const LearningRatesPerSample& learningRates,
+                                       bool needAveMultiplier = true);

    ///
    /// Create an instance of the CNTK built-in FSAdaGrad (improved AdaGrad) learner.
    ///
-    CNTK_API LearnerPtr FSAdaGradLearner(const std::unordered_set<Parameter>& parameters);
+    CNTK_API LearnerPtr FSAdaGradLearner(const std::unordered_set<Parameter>& parameters,
+                                         const LearningRatesPerSample& learningRates,
+                                         const MomentumsPerSample& momentums);

    ///
    /// Create an instance of the CNTK built-in RMSProp learner.
    ///
    CNTK_API LearnerPtr RMSPropLearner(const std::unordered_set<Parameter>& parameters,
+                                       const LearningRatesPerSample& learningRates,
                                       double gamma,
                                       double inc,
                                       double dec,
@ -1975,7 +2220,7 @@ namespace CNTK

    ///
    /// Trainer is the top-level abstraction responsible for the orchestration of the training of a model
-    /// using the specified learners and training data either explicilty supplied as Value objects or from
+    /// using the specified learners and training data either explicitly supplied as Value objects or from
    /// a MinibatchSource object.
    ///
    class Trainer
@ -2063,7 +2308,7 @@ namespace CNTK
    };

    ///
-    /// Abstraction for generating minbatches of samples for training/evaluation.
+    /// Abstraction for generating minibatches of samples for training/evaluation.
    ///
    class MinibatchSource : public std::enable_shared_from_this<MinibatchSource>
    {
@ -2079,10 +2324,14 @@ namespace CNTK
        /// #samples or both. In case the size is specified in terms of both #sequences and #samples, the smaller of the 2 is taken. The actual
        /// returned size of the minibatch is the min across all streams. Also the requested MB size fields in the maps are updated by the 
        /// MinibatchSource to contain the actual #sequences and #samples in the returned minibatch for the corresponding stream.
-        /// The return value indciates if the MinibatchSource will return any further data in subsequent calls of this function.
+        /// The return value indicates if the MinibatchSource will return any further data in subsequent calls of this function.
        ///
-        virtual std::unordered_map<StreamInfo, MinibatchData> GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
+        virtual const std::unordered_map<StreamInfo, MinibatchData>& GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
                                                                               const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice()) = 0;
+        ///
+        /// Destruct this MinibatchSource.
+        ///
+        virtual ~MinibatchSource() {}

        // TODO: Methods to save and restore from checkpoints

--- a/Source/CNTKv2LibraryDll/API/CNTKLibraryInternals.h
+++ b/Source/CNTKv2LibraryDll/API/CNTKLibraryInternals.h
@ -7,6 +7,12 @@

 #pragma once

+#ifdef SWIG
+#define final
+#define explicit
+#define static_assert(condition, message)
+#endif
+
 #ifdef _WIN32
 #ifdef CNTKV2LIBRARYDLL
 #define CNTK_API __declspec(dllexport)
@ -47,8 +53,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {

    template <typename ElementType>
    class ComputationNode;
-
-    class File;
 }}}

 // TODO: The following should be reconciled with the equivalent code in the CNTK implementation
@ -133,7 +137,7 @@ namespace CNTK
 #define NOT_IMPLEMENTED                                                                                                              \
    {                                                                                                                                \
        fprintf(stderr, "Inside File: %s  Line: %d  Function: %s  -> Feature Not Implemented.\n", __FILE__, __LINE__, __FUNCTION__); \
-        LogicError("Inside File: %s  Line: %d  Function: %s  -> Feature Not Implemented.\n", __FILE__, __LINE__, __FUNCTION__);      \
+        CNTK::LogicError("Inside File: %s  Line: %d  Function: %s  -> Feature Not Implemented.\n", __FILE__, __LINE__, __FUNCTION__);      \
    }
 #endif
 }
@ -144,6 +148,7 @@ namespace CNTK
    class CompositeFunction;
    class Function;
    class Variable;
+    class Axis;

    // Similar to make_shared except that it associates a custom deleter with the shared_ptr to ensure
    // that objects are deleted on the same side of the library DLL where they are allocated
@ -174,4 +179,15 @@ namespace CNTK

    class MinibatchSource;
    typedef std::shared_ptr<MinibatchSource> MinibatchSourcePtr;
+
+    namespace Internal
+    {
+        CNTK_API FunctionPtr PackedIndex(const Variable& operand, const Variable& index, const std::wstring& name = L"");
+        CNTK_API FunctionPtr GatherPacked(const Variable& operand, const Variable& packedIndex, const std::wstring& name = L"");
+        CNTK_API FunctionPtr IsWithin(const Variable& operand, int offset, const std::wstring& name = L"");
+        CNTK_API FunctionPtr Where(const Variable& condition, const std::vector<Axis>& newDynamicAxes, const std::wstring& name = L"");
+        CNTK_API FunctionPtr Gather(const Variable& operand, const Variable& condition, const std::vector<Axis>& newDynamicAxes, const std::wstring& name = L"");
+        CNTK_API FunctionPtr Slice(const Variable& operand, const Axis& axis, int beginIndex, int endIndex, const std::wstring& name = L"");
+        CNTK_API FunctionPtr ReduceElements(const Variable& operand, const std::wstring& reductionOpName, const Axis& axis, const std::wstring& name = L"");
+    }
 }
--- a/Source/CNTKv2LibraryDll/BackCompat.cpp
+++ b/Source/CNTKv2LibraryDll/BackCompat.cpp
@ -15,6 +15,7 @@
 #include "RecurrentNodes.h"
 #include "EvaluationNodes.h"
 #include "TrainingNodes.h"
+#include "ReshapingNodes.h"

 using namespace Microsoft::MSR::CNTK;

@ -32,6 +33,7 @@ namespace CNTK

        Variable var;
        NDShape varShape = AsNDShape(node->GetSampleLayout());
+
        // The CNTK sample layouts may have trailing axes with dimension size of 1 which are automatically
        // added when converting from NDShape to CNTK internal TensorShapes and are not present in the original
        // shapes specified by the user. These should be truncated.
@ -57,11 +59,10 @@ namespace CNTK
                if (node->HasMBLayout())
                {
                    // TODO: Currently only default dynamic axis is supported
-                    const std::wstring defaultCNTKDynamicAxisName = L"";
-                    if (inputNode->GetRequestedDynamicAxis() != defaultCNTKDynamicAxisName)
-                        LogicError("Found dynamic axis named '%S' while currently only default dynamic axis named '%S' is supported!", node->GetMBLayout()->GetAxisName(), defaultCNTKDynamicAxisName.c_str());
+                    auto inputNodeInternalDynamicAxisName = inputNode->GetRequestedDynamicAxis();
+                    std::vector<Axis> inputVarDynamicAxes = DynamicAxesFromInternalDynamicAxisName(inputNodeInternalDynamicAxisName);

-                    var = Variable(varShape, isSparse, AsDataType<ElementType>(), node->GetLearningRateMultiplier() != 0, node->GetName());
+                    var = Variable(varShape, isSparse, AsDataType<ElementType>(), node->GetLearningRateMultiplier() != 0, node->GetName(), inputVarDynamicAxes);
                }
                else
                {
@ -121,6 +122,40 @@ namespace CNTK
                opType = PrimitiveOpType::Reciprocal;
            else if (node->OperationName() == OperationNameOf(SoftmaxNode))
                opType = PrimitiveOpType::Softmax;
+            else if (node->OperationName() == OperationNameOf(HardmaxNode))
+                opType = PrimitiveOpType::Hardmax;
+            else if (node->OperationName() == OperationNameOf(TransposeDimensionsNode))
+            {
+                // The TransposeAxes primitive reads "axis1"/"axis2" back as axis names (std::wstring), so the
+                // axes are stored by name, as done for SliceNode. The internal node's axis ids are 1-based
+                // while static Axis indices are 0-based, hence the '- 1'.
+                auto transposeDimensionsNode = node->As<TransposeDimensionsNode<ElementType>>();
+                primitiveFunctionConfigParameters[L"axis1"] = Axis(transposeDimensionsNode->Axis1() - 1).Name();
+                primitiveFunctionConfigParameters[L"axis2"] = Axis(transposeDimensionsNode->Axis2() - 1).Name();
+
+                opType = PrimitiveOpType::TransposeAxes;
+            }
+            else if (node->OperationName() == OperationNameOf(WhereNode))
+            {
+                auto whereNode = node->As<WhereNode<ElementType>>();
+                auto internalDynamicAxisName = whereNode->DynamicAxisName();
+                std::vector<Axis> dynamicAxes = DynamicAxesFromInternalDynamicAxisName(internalDynamicAxisName);
+                std::vector<std::wstring> dynamicAxesNames;
+                for (auto axis : dynamicAxes)
+                    dynamicAxesNames.push_back(axis.Name());
+
+                primitiveFunctionConfigParameters[L"newDynamicAxes"] = AsDictionaryValueVector(dynamicAxesNames);
+
+                opType = PrimitiveOpType::Where;
+            }
+            else if (node->OperationName() == OperationNameOf(SliceNode))
+            {
+                auto sliceNode = node->As<SliceNode<ElementType>>();
+                primitiveFunctionConfigParameters[L"axis"] = Axis(sliceNode->Axis() - 1).Name();
+                primitiveFunctionConfigParameters[L"beginIndex"] = sliceNode->BeginIndex();
+                primitiveFunctionConfigParameters[L"endIndex"] = sliceNode->EndIndex();
+
+                opType = PrimitiveOpType::Slice;
+            }
+            else if (node->OperationName() == OperationNameOf(SumElementsNode))
+                opType = PrimitiveOpType::SumAll;
            else if (node->OperationName() == OperationNameOf(PlusNode))
                opType = PrimitiveOpType::Plus;
            else if (node->OperationName() == OperationNameOf(MinusNode))
@ -139,11 +174,23 @@ namespace CNTK
                opType = PrimitiveOpType::Greater;
            else if (node->OperationName() == OperationNameOf(GreaterEqualNode))
                opType = PrimitiveOpType::GreaterEqual;
+            else if (node->OperationName() == OperationNameOf(PackedIndexNode))
+                opType = PrimitiveOpType::PackedIndex;
+            else if (node->OperationName() == OperationNameOf(GatherPackedNode))
+            {
+                std::swap(inputVars[0], inputVars[1]);
+                opType = PrimitiveOpType::GatherPacked;
+            }
            else if (node->OperationName() == OperationNameOf(TimesNode))
            {
-                primitiveFunctionConfigParameters[L"numOutputAxes"] = DictionaryValue((size_t)node->As<TimesNode<ElementType>>()->OutputRank());
+                primitiveFunctionConfigParameters[L"numOutputAxes"] = (size_t)node->As<TimesNode<ElementType>>()->OutputRank();
                opType = PrimitiveOpType::Times;
            }
+            else if (node->OperationName() == OperationNameOf(TransposeTimesNode))
+            {
+                primitiveFunctionConfigParameters[L"numOutputAxes"] = (size_t)node->As<TransposeTimesNode<ElementType>>()->OutputRank();
+                opType = PrimitiveOpType::TransposeTimes;
+            }
            else if (node->OperationName() == OperationNameOf(PastValueNode))
            {
                if (inputVars.size() == 1)
@ -151,7 +198,7 @@ namespace CNTK
                    auto initialStateVar = Constant({}, node->As<PastValueNode<ElementType>>()->InitialActivationValue(), AsDeviceDescriptor(node->GetDeviceId()));
                    inputVars.insert(inputVars.begin(), initialStateVar);
                }
-                primitiveFunctionConfigParameters[L"stepSize"] = DictionaryValue((size_t)node->As<PastValueNode<ElementType>>()->TimeStep());
+                primitiveFunctionConfigParameters[L"stepSize"] = (size_t)node->As<PastValueNode<ElementType>>()->TimeStep();
                opType = PrimitiveOpType::PastValue;
            }
            else if (node->OperationName() == OperationNameOf(FutureValueNode))
@ -161,7 +208,7 @@ namespace CNTK
                    auto initialStateVar = Constant({}, node->As<FutureValueNode<ElementType>>()->InitialActivationValue(), AsDeviceDescriptor(node->GetDeviceId()));
                    inputVars.insert(inputVars.begin(), initialStateVar);
                }
-                primitiveFunctionConfigParameters[L"stepSize"] = DictionaryValue((size_t)node->As<FutureValueNode<ElementType>>()->TimeStep());
+                primitiveFunctionConfigParameters[L"stepSize"] = (size_t)node->As<FutureValueNode<ElementType>>()->TimeStep();
                opType = PrimitiveOpType::FutureValue;
            }
            else if (node->OperationName() == OperationNameOf(SquareErrorNode))
@ -176,8 +223,14 @@ namespace CNTK
                std::swap(inputVars[0], inputVars[1]);
                opType = PrimitiveOpType::ClassificationError;
            }
-            else if (node->OperationName() == OperationNameOf(SumElementsNode))
-                opType = PrimitiveOpType::ReduceSum;
+            else if (node->OperationName() == OperationNameOf(ReduceElementsNode))
+            {
+                auto reduceElementsNode = node->As<ReduceElementsNode<ElementType>>();
+                primitiveFunctionConfigParameters[L"CNTKInternalReductionAxisIndex"] = (size_t)reduceElementsNode->ReductionAxis();
+                primitiveFunctionConfigParameters[L"ReductionOpName"] = reduceElementsNode->ReductionOpName();
+
+                opType = PrimitiveOpType::ReduceElements;
+            }
            else if (node->OperationName() == OperationNameOf(ConvolutionNode))
            {
                auto convolutionNode = node->As<ConvolutionNode<ElementType>>();
--- a/Source/CNTKv2LibraryDll/Common.cpp
+++ b/Source/CNTKv2LibraryDll/Common.cpp
@ -14,21 +14,17 @@ namespace CNTK
        return GPUDevice(0);
    }

+    /*static*/ const std::wstring Axis::s_staticAxisNamePrefix = L"staticAxis_";
+
    /*static*/ const Axis& Axis::DefaultDynamicAxis()
    {
        static Axis s_defaultDynamicAxis(L"defaultDynamicAxis");
        return s_defaultDynamicAxis;
    }

-    /*static*/ const Axis& Axis::BatchAxis()
+    /*static*/ const Axis& Axis::DefaultBatchAxis()
    {
-        static Axis s_batchAxis(L"batchAxis");
+        static Axis s_batchAxis(L"defaultBatchAxis");
        return s_batchAxis;
    }
-
-    /*static*/ const Axis& Axis::AllAxes()
-    {
-        static Axis s_allAxes(L"allAxes");
-        return s_allAxes;
-    }
 }
--- a/Source/CNTKv2LibraryDll/Function.cpp
+++ b/Source/CNTKv2LibraryDll/Function.cpp
@ -10,6 +10,10 @@
 #include "Utils.h"
 #include "ComputationNode.h"
 #include "ReshapingNodes.h"
+#include "EvaluationNodes.h"
+#include "TrainingNodes.h"
+#include "LinearAlgebraNodes.h"
+#include "InputAndParamNodes.h"

 using namespace Microsoft::MSR::CNTK;

@ -72,6 +76,17 @@ namespace CNTK
        }
    }

+    /*static*/ const std::wstring PrimitiveFunction::InternalSumReductionOpName = L"Sum";
+    /*static*/ const std::wstring PrimitiveFunction::InternalLogSumReductionOpName = L"LogSum";
+    /*static*/ const std::wstring PrimitiveFunction::InternalMeanReductionOpName = L"Mean";
+    /*static*/ const std::wstring PrimitiveFunction::InternalMaxReductionOpName = L"Max";
+    /*static*/ const std::wstring PrimitiveFunction::InternalMinReductionOpName = L"Min";
+    /*static*/ const std::wstring PrimitiveFunction::InternalAllReductionOpName = L"All";
+    /*static*/ const std::wstring PrimitiveFunction::InternalAnyReductionOpName = L"Any";
+
+    /*static*/ std::wstring CompositeFunction::s_internalDefaultDynamicAxisName = L"";
+    /*static*/ std::wstring CompositeFunction::s_internalNoSequenceAxisName = L"noSequenceAxis";
+
    // Replace any PlaceHolder Variables in the graph of Functions underlying 'this' CompositeFunction. All PlaceHolder variables
    // should have been replaced before performing any Forward compute of 'this' Function.
    /*virtual*/ void CompositeFunction::ReplacePlaceholders(const std::unordered_map<Placeholder, Variable>& placeholderReplacements,
@ -122,22 +137,46 @@ namespace CNTK
                computationNodePtr->SetLearningRateMultiplier(0.0);

            NDArrayViewPtr value = variable.IsConstant() ? Constant(variable).Value() : Parameter(variable).Value();
-            auto matrix = variable.IsConstant() ? value->GetMatrix<ElementType>()->AsReference() : value->GetWritableMatrix<ElementType>()->AsReference();
-            computationNodePtr->Value() = std::move(matrix);
+            std::shared_ptr<const Matrix<ElementType>> valueMatrix = variable.IsConstant() ? value->GetMatrix<ElementType>() : value->GetWritableMatrix<ElementType>();
+            if (variable.IsParameter() || (valueMatrix->GetDeviceId() == network->GetDeviceId()))
+                computationNodePtr->Value() = valueMatrix->AsReference();
+            else
+            {
+                Matrix<ElementType> clonedMatrix(valueMatrix->GetNumRows(), valueMatrix->GetNumCols(), network->GetDeviceId(), valueMatrix->GetMatrixType(), valueMatrix->GetFormat());
+                clonedMatrix.AssignValuesOf(*valueMatrix);
+                computationNodePtr->Value() = std::move(clonedMatrix);
+            }
        }
        else if (variable.IsInput())
        {
-            // TODO: Support inputs with > 1 dynamic axes
-            if (variable.DynamicAxes().size() != 1)
-                LogicError("Currently only Input variables with one dynamic axis are supported");
+            // TODO: Input variables currently are required to have the default batch axis
+            auto dynamicAxes = variable.DynamicAxes();
+            auto foundDefaultBatchAxis = std::find(dynamicAxes.begin(), dynamicAxes.end(), Axis::DefaultBatchAxis());
+            if (foundDefaultBatchAxis == dynamicAxes.end())
+                LogicError("Currently Input Variables are required to have the DefaultBatchAxis as one of their dynamic axes");

-            auto dynamicAxis = variable.DynamicAxes()[0];
-            if (dynamicAxis != Axis::DefaultDynamicAxis())
-                LogicError("Currently only Input variables with DefaultDynamicAxis are supported");
-            if (IsSparseInput(variable))
-                computationNodePtr = builder.CreateSparseInputNode(variable.Name(), AsTensorShape(variable.Shape()));
+            if (dynamicAxes.back() != Axis::DefaultBatchAxis())
+                LogicError("Currently Input Variables are required to have the DefaultBatchAxis as their last dynamic axes");
+
+            // TODO: Support inputs with > 2 dynamic axes
+            if ((dynamicAxes.size() < 1) || (dynamicAxes.size() > 2))
+                LogicError("Currently only Input variables with 1 or 2 dynamic axis are supported");
+
+            std::wstring internalDynamicAxisName;
+            if (dynamicAxes.size() == 1)
+                internalDynamicAxisName = s_internalNoSequenceAxisName;
+            else if (dynamicAxes[0] == Axis::DefaultDynamicAxis())
+                internalDynamicAxisName = s_internalDefaultDynamicAxisName;
            else
-                computationNodePtr = builder.CreateInputNode(variable.Name(), AsTensorShape(variable.Shape()));
+                internalDynamicAxisName = dynamicAxes[0].Name();
+
+            if (!internalDynamicAxisName.empty())
+                network->AddNodeToNetAndAttachInputs(New<DynamicAxisNode<ElementType>>(network->GetDeviceId(), internalDynamicAxisName), {});
+
+            if (IsSparseInput(variable))
+                computationNodePtr = builder.CreateSparseInputNode(variable.Name(), AsTensorShape(variable.Shape()), internalDynamicAxisName);
+            else
+                computationNodePtr = builder.CreateInputNode(variable.Name(), AsTensorShape(variable.Shape()), internalDynamicAxisName);

            if (variable.NeedsGradient())
            {
@ -219,11 +258,29 @@ namespace CNTK
                computationNodePtr = builder.Reciprocal(input0Node, function->Name());
                break;
            case PrimitiveOpType::Softmax:
-                if (functionInputs[0].Shape().NumAxes() > 1)
-                    InvalidArgument("Softmax operation can only be applied to a 1D input");
-
                computationNodePtr = builder.Softmax(input0Node, function->Name());
                break;
+            case PrimitiveOpType::Hardmax:
+                computationNodePtr = builder.Hardmax(input0Node, function->Name());
+                break;
+            case PrimitiveOpType::TransposeAxes:
+            {
+                auto axis1 = Axis(functionConfig[L"axis1"].GetValue<std::wstring>());
+                auto axis2 = Axis(functionConfig[L"axis2"].GetValue<std::wstring>());
+
+                // The axis ids passed to the internal CNTK TransposeDimensionsNode are 1 based instead of 0 based
+                computationNodePtr = New<TransposeDimensionsNode<ElementType>>(network->GetDeviceId(), function->Name(), (int)(axis1.StaticAxisIndex() + 1), (int)(axis2.StaticAxisIndex() + 1));
+                network->AddNodeToNetAndAttachInputs(computationNodePtr, { input0Node });
+                break;
+            }
+            case PrimitiveOpType::Where:
+            {
+                auto dynamicAxes = variable.DynamicAxes();
+                auto internalCNTKWhereNodeDynamicAxisName = (dynamicAxes == std::vector<Axis>({ Axis::DefaultBatchAxis() })) ? CompositeFunction::s_internalNoSequenceAxisName : dynamicAxes[0].Name();
+                computationNodePtr = New<WhereNode<ElementType>>(network->GetDeviceId(), function->Name(), internalCNTKWhereNodeDynamicAxisName);
+                network->AddNodeToNetAndAttachInputs(computationNodePtr, { input0Node });
+                break;
+            }
            case PrimitiveOpType::Pooling:
            {
                PoolingType poolingType = (PoolingType)(functionConfig[L"poolingType"].GetValue<size_t>());
@ -235,6 +292,9 @@ namespace CNTK
                computationNodePtr = builder.Pooling(input0Node, AsCNTKPoolKind(poolingType), AsTensorShape(poolingWindowsShape, true), AsTensorShape(strides, true), autoPadding, AsTensorShape(lowerPad, true), AsTensorShape(upperPad, true), ImageLayoutKind::CHW, function->Name());
                break;
            }
+            case PrimitiveOpType::SumAll:
+                computationNodePtr = builder.Sum(input0Node, function->Name());
+                break;
            case PrimitiveOpType::Plus:
                computationNodePtr = builder.Plus(input0Node, input1Node, function->Name());
                break;
@ -268,6 +328,12 @@ namespace CNTK
                computationNodePtr = builder.Times(input0Node, input1Node, numOutputAxes, function->Name());
                break;
            }
+            case PrimitiveOpType::TransposeTimes:
+            {
+                size_t numOutputAxes = functionConfig[L"numOutputAxes"].GetValue<size_t>();
+                computationNodePtr = network->AddNodeToNetAndAttachInputs(New<TransposeTimesNode<ElementType>>(network->GetDeviceId(), function->Name(), numOutputAxes), { input0Node, input1Node });
+                break;
+            }
            case PrimitiveOpType::Convolution:
            {
                NDShape outputMapCount, kernelShape;
@ -296,35 +362,25 @@ namespace CNTK
            {
                Variable initialStateVar = functionInputs[0];
                Variable inputOperandVar = functionInputs[1];
-                // TODO: Current we only support a scalar initial state
-                if (!initialStateVar.IsConstant() || (initialStateVar.Shape().NumAxes() > 0))
-                    LogicError("Currently PastValue/FutureValue Function only supports scalar initial state");
-
-                // TODO: We currently only support input operand with 1 static axis for PastValue/FutureValue
-                if (inputOperandVar.Shape().NumAxes() != 1)
-                    LogicError("Currently PastValue/FutureValue Function only supports input operand with 1 static axis");
-
-                // TODO: We currently only support input operand with 1 dynamic axis for PastValue/FutureValue
-                if (inputOperandVar.DynamicAxes().size() != 1)
-                    LogicError("Currently PastValue/FutureValue Function only supports input operand with 1 dynamic axis");

                // Get the intial state of the PastValue/FutureValue operation
                ElementType initStateValue;
                NDArrayView tempView({}, &initStateValue, 1, DeviceDescriptor::CPUDevice());
                tempView.CopyFrom(*Constant(initialStateVar).Value());

+                size_t stepSize = primitiveFunction->FunctionConfig()[L"stepSize"].GetValue<size_t>();
                if (op == PrimitiveOpType::PastValue)
-                    computationNodePtr = builder.PastValue(input1Node, (float)initStateValue, inputOperandVar.Shape()[0], primitiveFunction->FunctionConfig()[L"stepSize"].GetValue<size_t>(), function->Name());
+                    computationNodePtr = builder.PastValue(input1Node, (float)initStateValue, inputOperandVar.Shape().TotalSize(), stepSize, function->Name());
                else
-                    computationNodePtr = builder.FutureValue(input1Node, (float)initStateValue, inputOperandVar.Shape()[0], primitiveFunction->FunctionConfig()[L"stepSize"].GetValue<size_t>(), function->Name());
+                    computationNodePtr = builder.FutureValue(input1Node, (float)initStateValue, inputOperandVar.Shape().TotalSize(), stepSize, function->Name());

                break;
            }
-            case PrimitiveOpType::ReduceSum:
+            case PrimitiveOpType::ReduceElements:
            {
-                // TODO: Use the new ReduceElements node instead of the legacy SumElements node for reduction. Currently ReduceElements has incorrect MBLayout inference.
-                //computationNodePtr = network->AddNodeToNetAndAttachInputs(New<ReduceElementsNode<ElementType>>(network->GetDeviceId(), function->Name(), L"Sum", 0), { input0Node });
-                computationNodePtr = builder.Sum(input0Node, function->Name());
+                auto CNTKInternalReductionAxisIndex = (int)functionConfig[L"CNTKInternalReductionAxisIndex"].GetValue<size_t>();
+                auto reductionOpName = functionConfig[L"ReductionOpName"].GetValue<std::wstring>();
+                computationNodePtr = network->AddNodeToNetAndAttachInputs(New<ReduceElementsNode<ElementType>>(network->GetDeviceId(), function->Name(), reductionOpName, CNTKInternalReductionAxisIndex), { input0Node });
                break;
            }
            case PrimitiveOpType::BatchNormalization:
@ -353,6 +409,25 @@ namespace CNTK
                computationNodePtr = variableToNodeMap[variable];

                break;
+            case PrimitiveOpType::PackedIndex:
+                computationNodePtr = New<PackedIndexNode<ElementType>>(network->GetDeviceId(), function->Name());
+                network->AddNodeToNetAndAttachInputs(computationNodePtr, { input0Node, input1Node });
+                break;
+            case PrimitiveOpType::GatherPacked:
+                computationNodePtr = New<GatherPackedNode<ElementType>>(network->GetDeviceId(), function->Name());
+                network->AddNodeToNetAndAttachInputs(computationNodePtr, { input1Node, input0Node });
+                break;
+            case PrimitiveOpType::Slice:
+            {
+                auto axis = Axis(functionConfig[L"axis"].GetValue<std::wstring>());
+                int beginIndex = functionConfig[L"beginIndex"].GetValue<size_t>();
+                int endIndex = functionConfig[L"endIndex"].GetValue<size_t>();
+
+                // Internal CNTK SliceNode takes 1 based axis indices instead of 0 based
+                computationNodePtr = New<SliceNode<ElementType>>(network->GetDeviceId(), function->Name(), beginIndex, endIndex, (int)(axis.StaticAxisIndex() + 1));
+                network->AddNodeToNetAndAttachInputs(computationNodePtr, { input0Node });
+                break;
+            }
            default:
                LogicError("Specified op %s not yet supported", PrimitiveOpTypeName(op));
                break;
@ -486,11 +561,11 @@ namespace CNTK
        if (value->Data()->Shape().NumAxes() == var.Shape().NumAxes())
            return{ value->Data()->GetMatrix<ElementType>(), nullptr };

-        if (value->Data()->Shape().NumAxes() != (var.Shape().NumAxes() + var.DynamicAxes().size() + 1))
-            InvalidArgument("Value's number of axes should be larger than the Variable's number of axes by 1 + number of dynamic axes");
+        if (value->Data()->Shape().NumAxes() < (var.Shape().NumAxes() + var.DynamicAxes().size()))
+            InvalidArgument("Value's number of axes should be larger than the Variable's number of axes by number of dynamic axes");

-        if (var.DynamicAxes().size() > 1)
-            LogicError("More than one dynamic axis for a variable is currently unsupported");
+        if (var.DynamicAxes().size() > 2)
+            LogicError("More than 2 dynamic axis for a variable is currently unsupported");

        size_t maxNumTimeSteps = value->Data()->Shape()[var.Shape().NumAxes()];
        size_t numSequences = value->Data()->Shape()[var.Shape().NumAxes() + 1];
@ -618,9 +693,9 @@ namespace CNTK
                sequenceLengths.push_back(sequenceInfo.GetNumTimeSteps());
        }

-        // Reshuffle to data to unpack and uninterleave the CNTK form data
-        // Now generate the gather indices
-        auto shuffledMatrixData = std::make_shared<Matrix<ElementType>>(matrix.GetNumRows(), maxNumTimeSteps * numSequences, matrix.GetDeviceId());
+        // Reshuffle the data to unpack and uninterleave the CNTK form packed data
+        // Now generate the scatter indices
+        auto shuffledMatrixData = std::make_shared<Matrix<ElementType>>(matrix.GetNumRows(), maxNumTimeSteps * numSequences, matrix.GetDeviceId(), matrix.GetMatrixType(), matrix.GetFormat());

        std::vector<size_t> sequencesShorterThanLongestSequence;
        for (size_t i = 0; i < numSequences; ++i)
@ -659,15 +734,15 @@ namespace CNTK
        }

        auto tensorView = new TensorView<ElementType>(shuffledMatrixData, AsTensorShape(valueDataShape));
-        auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), StorageFormat::Dense, valueDataShape, readOnly, tensorView);
+        auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(shuffledMatrixData->GetFormat()), valueDataShape, readOnly, tensorView);
        return MakeSharedObject<Value>(data, mask);
    }

    template <typename ElementType>
    /*static*/ ValuePtr CompositeFunction::GetValueObjectFromCNTKImplMatrixAndMBLayout(Variable var, const Matrix<ElementType>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/)
    {
-        if (var.DynamicAxes().size() > 1)
-            LogicError("More than one dynamic axis for a variable is currently unsupported");
+        if (var.DynamicAxes().size() > 2)
+            LogicError("More than 2 dynamic axis for a variable is currently unsupported");

        if (AsDataType<ElementType>() != var.GetDataType())
            LogicError("The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(var.GetDataType()));
@ -732,7 +807,7 @@ namespace CNTK
        MBLayoutPtr layout = CNTKMatrixAndMBLayout.second;
        auto nodeLayout = computationNode->GetMBLayout();
        if (((layout == nullptr) != (nodeLayout == nullptr)) || ((layout != nullptr) && (*layout != *nodeLayout)))
-            InvalidArgument("The layout of the specified gradient Value in incompatible with the layout of the corresponding Variable computed during Forward call");
+            InvalidArgument("The layout of the specified gradient Value is incompatible with the layout of the corresponding Variable computed during Forward call");
        computationNode->As<ComputationNode<ElementType>>()->AssignGradient(*CNTKMatrixAndMBLayout.first);
    }

@ -814,12 +889,9 @@ namespace CNTK
        }

        if (varValue == nullptr)
-        {
-            auto data = MakeSharedObject<NDArrayView>(var.GetDataType(), valueShape, AsDeviceDescriptor(computationNode->ValuePtr()->GetDeviceId()));
-            auto mask = (nodeValue->Mask() != nullptr) ? MakeSharedObject<NDMask>(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr;
-            varValue = MakeSharedObject<Value>(data, mask);
-        }
-        varValue->CopyFrom(*nodeValue);
+            varValue = nodeValue->DeepClone();
+        else
+            varValue->CopyFrom(*nodeValue);
    }

    void CompositeFunction::GetNetworkOutputs(std::unordered_map<Variable, ValuePtr>& outputs)
@ -984,7 +1056,7 @@ namespace CNTK

    FunctionPtr Round(const Variable& operand, const std::wstring& name/* = L""*/)
    {
-        return Floor(Plus(operand, Constant(NDShape({}), 0.5f)), name);
+        // Rounding is expressed as Floor(operand + 0.5); the 0.5 scalar is created with the operand's data type
+        auto oneHalf = ScalarConstant(operand.GetDataType(), 0.5f);
+        return Floor(Plus(operand, oneHalf), name);
    }

    FunctionPtr Floor(const Variable& operand, const std::wstring& name/* = L""*/)
@ -1012,6 +1084,71 @@ namespace CNTK
        return UnaryOp(PrimitiveOpType::Softmax, operand, Dictionary(), name);
    }

+    // Creates a Hardmax primitive function over 'operand'. The op carries no extra configuration.
+    FunctionPtr Hardmax(const Variable& operand, const std::wstring& name/* = L""*/)
+    {
+        Dictionary emptyConfig;
+        return UnaryOp(PrimitiveOpType::Hardmax, operand, std::move(emptyConfig), name);
+    }
+
+    // Creates a TransposeAxes primitive function that exchanges 'axis1' and 'axis2' of 'operand'.
+    // Both axes must be static axes; a LogicError is reported otherwise.
+    FunctionPtr TransposeAxes(const Variable& operand, const Axis& axis1, const Axis& axis2, const std::wstring& name /*= L""*/)
+    {
+        const bool bothAxesAreStatic = axis1.IsStaticAxis() && axis2.IsStaticAxis();
+        if (!bothAxesAreStatic)
+            LogicError("TransposeAxes currently does not support transposing dynamic axes");
+
+        // The two axes are handed to the primitive by name through its configuration dictionary
+        Dictionary opConfig;
+        opConfig[L"axis1"] = axis1.Name();
+        opConfig[L"axis2"] = axis2.Name();
+        return UnaryOp(PrimitiveOpType::TransposeAxes, operand, std::move(opConfig), name);
+    }
+
+    // Slices 'operand' along 'axis' between 'beginIndex' and 'endIndex'.
+    //
+    // - (endIndex - beginIndex) must be positive, otherwise InvalidArgument is reported.
+    // - Slicing along the default batch axis is not supported (LogicError).
+    // - For a static axis the work is delegated to Internal::Slice.
+    // - For a dynamic axis, which must be one of the operand's dynamic axes, a flags function
+    //   is built from Internal::IsWithin and the frames are collected with Internal::Gather.
+    //   NOTE(review): the flags are presumably a 0/1 mask marking the frames to keep; confirm against IsWithin.
+    //
+    // 'name' is applied to the resulting function on both the static and the dynamic path.
+    FunctionPtr Slice(const Variable& operand, const Axis& axis, int beginIndex, int endIndex, const std::wstring& name /*= L""*/)
+    {
+        if ((endIndex - beginIndex) <= 0)
+            InvalidArgument("CNTK::Slice: endIndex (%d) - beginIndex (%d) must be a positive number", endIndex, beginIndex);
+
+        if (axis == Axis::DefaultBatchAxis())
+            LogicError("Slice is currently unsupported along the batch axis");
+
+        if (axis.IsStaticAxis())
+            return Internal::Slice(operand, axis, beginIndex, endIndex, name);
+
+        auto operandAxes = operand.DynamicAxes();
+        auto findAxis = std::find(operandAxes.begin(), operandAxes.end(), axis);
+        if (findAxis == operandAxes.end())
+            InvalidArgument("The specified dynamic axis named %S does not match any of the dynamic axes of the operand", axis.Name().c_str());
+
+        // Flags for the begin boundary; a positive beginIndex uses the complement (1 - IsWithin)
+        auto beginFlagsLambda = [beginIndex, operand]() {
+            return (beginIndex > 0) ? Minus(ScalarConstant(operand.GetDataType(), 1.0f), Internal::IsWithin(operand, beginIndex)) : Internal::IsWithin(operand, beginIndex);
+        };
+
+        // Flags for the end boundary; a non-positive endIndex uses the complement (1 - IsWithin)
+        auto endFlagsLambda = [endIndex, operand]() {
+            return (endIndex > 0) ? Internal::IsWithin(operand, endIndex) : Minus(ScalarConstant(operand.GetDataType(), 1.0f), Internal::IsWithin(operand, endIndex));
+        };
+
+        // An index of 0 needs no flags for that boundary (IsWithin rejects a 0 offset);
+        // otherwise both flag functions are combined by elementwise multiplication.
+        FunctionPtr flags;
+        if (beginIndex == 0)
+            flags = endFlagsLambda();
+        else if (endIndex == 0)
+            flags = beginFlagsLambda();
+        else
+            flags = ElementTimes(beginFlagsLambda(), endFlagsLambda());
+
+        // Since we are slicing along a dynamic axis, the output variable's dynamic axes will be different than the operand
+        std::vector<Axis> newDynamicAxes;
+        for (const auto& operandAxis : operandAxes)
+        {
+            if (operandAxis == axis)
+            {
+                // If we are selecting just one frame from the dynamic axis, we can remove that axis
+                if ((endIndex - beginIndex) > 1)
+                    newDynamicAxes.push_back(CompositeFunction::NextAutoGeneratedDynamicAxis());
+            }
+            else
+                newDynamicAxes.push_back(operandAxis);
+        }
+
+        // Forward the caller-supplied name so the dynamic-axis slice is named like the static-axis one
+        return Internal::Gather(operand, flags, newDynamicAxes, name);
+    }
+
    FunctionPtr BinaryOp(PrimitiveOpType op, const Variable& leftOperand, const Variable& rightOperand, Dictionary&& opConfig, const std::wstring& name)
    {
        return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(op, std::vector<Variable>({ leftOperand, rightOperand }), std::move(opConfig), name), name);
@ -1074,6 +1211,13 @@ namespace CNTK
        return BinaryOp(PrimitiveOpType::Times, leftOperand, rightOperand, std::move(additionalProperties), name);
    }

+    // Creates a TransposeTimes primitive function of the two operands.
+    // 'numOutputAxes' is passed to the primitive through its configuration dictionary.
+    FunctionPtr TransposeTimes(const Variable& leftOperand, const Variable& rightOperand, size_t numOutputAxes /*= 1*/, const std::wstring& name/* = L""*/)
+    {
+        Dictionary opConfig;
+        opConfig[L"numOutputAxes"] = numOutputAxes;
+        return BinaryOp(PrimitiveOpType::TransposeTimes, leftOperand, rightOperand, std::move(opConfig), name);
+    }
+
    FunctionPtr SquaredError(const Variable& prediction, const Variable& targets, const std::wstring& name/* = L""*/)
    {
        return BinaryOp(PrimitiveOpType::SquaredError, prediction, targets, Dictionary(), name);
@ -1081,18 +1225,20 @@ namespace CNTK

    FunctionPtr CrossEntropyWithSoftmax(const Variable& prediction, const Variable& labels, const std::wstring& name/* = L""*/)
    {
-        return BinaryOp(PrimitiveOpType::CrossEntropyWithSoftmax, prediction, labels, Dictionary(), name);
+        // Composed from primitives: ReduceSum(ReduceLogSum(prediction, static axis 0) - TransposeTimes(labels, prediction))
+        auto predictionLogSum = ReduceLogSum(prediction, Axis(0));
+        auto labelsTimesPrediction = TransposeTimes(labels, prediction);
+        return ReduceSum(Minus(predictionLogSum, labelsTimesPrediction), name);
+        //return BinaryOp(PrimitiveOpType::CrossEntropyWithSoftmax, prediction, labels, Dictionary(), name);
    }

    FunctionPtr ClassificationError(const Variable& prediction, const Variable& labels, const std::wstring& name/* = L""*/)
    {
-        return BinaryOp(PrimitiveOpType::ClassificationError, prediction, labels, Dictionary(), name);
+        // Composed from primitives: ReduceSum(1 - TransposeTimes(labels, Hardmax(prediction)))
+        auto scalarOne = ScalarConstant(prediction.GetDataType(), 1.0f);
+        auto labelsTimesHardmax = TransposeTimes(labels, Hardmax(prediction));
+        return ReduceSum(Minus(scalarOne, labelsTimesHardmax), name);
+        //return BinaryOp(PrimitiveOpType::ClassificationError, prediction, labels, Dictionary(), name);
    }

    FunctionPtr PastValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/)
    {
-        if (operand.DynamicAxes().size() != 1)
-            InvalidArgument("PastValue overload that does not explicitly specify a dynamic axis can only be used for operands with exactly one dynamic axis");
+        if (operand.DynamicAxes().size() != 2)
+            InvalidArgument("PastValue overload that does not explicitly specify a dynamic axis can only be used for operands with exactly one dynamic sequence-axis");

        auto additionalProperties = Dictionary();
        additionalProperties[L"stepSize"] = DictionaryValue(stepSize);
@ -1101,8 +1247,8 @@ namespace CNTK

    FunctionPtr FutureValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/)
    {
-        if (operand.DynamicAxes().size() != 1)
-            InvalidArgument("FutureValue overload that does not explicitly specify a dynamic axis can only be used for operands with exactly one dynamic axis");
+        if (operand.DynamicAxes().size() != 2)
+            InvalidArgument("FutureValue overload that does not explicitly specify a dynamic axis can only be used for operands with exactly one dynamic sequence-axis");

        auto additionalProperties = Dictionary();
        additionalProperties[L"stepSize"] = DictionaryValue(stepSize);
@ -1111,7 +1257,17 @@ namespace CNTK

    FunctionPtr ReduceSum(const Variable& operand, const std::wstring& name/* = L""*/)
    {
-        return UnaryOp(PrimitiveOpType::ReduceSum, operand, Dictionary(), name);
+        // The axis-less overload maps to the SumAll primitive, which takes no configuration
+        Dictionary emptyConfig;
+        return UnaryOp(PrimitiveOpType::SumAll, operand, std::move(emptyConfig), name);
+    }
+
+    // Sum-reduction of 'operand' along 'axis'; delegates to Internal::ReduceElements with the sum op name.
+    FunctionPtr ReduceSum(const Variable& operand, const Axis& axis, const std::wstring& name/* = L""*/)
+    {
+        const std::wstring& sumOpName = PrimitiveFunction::InternalSumReductionOpName;
+        return Internal::ReduceElements(operand, sumOpName, axis, name);
+    }
+
+    // LogSum-reduction of 'operand' along 'axis'; delegates to Internal::ReduceElements with the log-sum op name.
+    FunctionPtr ReduceLogSum(const Variable& operand, const Axis& axis, const std::wstring& name/* = L""*/)
+    {
+        const std::wstring& logSumOpName = PrimitiveFunction::InternalLogSumReductionOpName;
+        return Internal::ReduceElements(operand, logSumOpName, axis, name);
    }

    FunctionPtr PerDimMeanVarianceNormalize(const Variable& operand, const NDArrayViewPtr& mean, const NDArrayViewPtr& invStdDev, const std::wstring& name /*= L""*/)
@ -1207,4 +1363,94 @@ namespace CNTK

        return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Combine, inputs, Dictionary(), name), name);
    }
+
+    namespace Internal
+    {
+        // Creates a PackedIndex primitive function with inputs (operand, index); no op configuration is needed.
+        FunctionPtr PackedIndex(const Variable& operand, const Variable& index, const std::wstring& name /*= L""*/)
+        {
+            Dictionary emptyConfig;
+            return BinaryOp(PrimitiveOpType::PackedIndex, operand, index, std::move(emptyConfig), name);
+        }
+
+        // Creates a GatherPacked primitive function with inputs (operand, packedIndex); no op configuration is needed.
+        FunctionPtr GatherPacked(const Variable& operand, const Variable& packedIndex, const std::wstring& name /*= L""*/)
+        {
+            Dictionary emptyConfig;
+            return BinaryOp(PrimitiveOpType::GatherPacked, operand, packedIndex, std::move(emptyConfig), name);
+        }
+
+        // Builds a function that subtracts a one-row slice of 'operand' from itself,
+        // i.e. an all-zero result (for finite inputs) derived from the operand.
+        // Operands with more than one static axis are rejected with a LogicError.
+        FunctionPtr ZeroesLike(const Variable& operand)
+        {
+            if (operand.Shape().NumAxes() > 1)
+                LogicError("ZeroesLike currently does not support operands with more than 1 static axes");
+
+            // Take rows [0, 1) along static axis 0 and compute x - x
+            auto rowSliceFunc = Internal::Slice(operand, Axis(0), 0, 1);
+            return Minus(rowSliceFunc, rowSliceFunc);
+        }
+
+        // Builds a recurrence over ZeroesLike(operand) with a scalar initial state of 1:
+        //   offset > 0 : PastValue with step size 'offset'
+        //   offset < 0 : FutureValue with step size '-offset'
+        // An offset of 0 is rejected with InvalidArgument.
+        // NOTE(review): the result is presumably 1 for frames within |offset| steps of the
+        // sequence start (offset > 0) or end (offset < 0) and 0 elsewhere; confirm against PastValue/FutureValue semantics.
+        FunctionPtr IsWithin(const Variable& operand, int offset, const std::wstring& name /*= L""*/)
+        {
+            // Negative offsets are valid (handled below), so only zero is an error
+            if (offset == 0)
+                InvalidArgument("Internal::CNTK::IsWithin: The offset must be non-zero");
+
+            if (offset > 0)
+                return PastValue(ScalarConstant(operand.GetDataType(), 1.0f), ZeroesLike(operand), offset, name);
+            else
+                return FutureValue(ScalarConstant(operand.GetDataType(), 1.0f), ZeroesLike(operand), -offset, name);
+        }
+
+        // Creates a Where primitive function over 'condition'.
+        // The names of 'newDynamicAxes' are stored under the "newDynamicAxes" key of the op configuration.
+        FunctionPtr Where(const Variable& condition, const std::vector<Axis>& newDynamicAxes, const std::wstring& name /*= L""*/)
+        {
+            // Collect the axis names in order
+            std::vector<std::wstring> axisNames;
+            for (const auto& dynamicAxis : newDynamicAxes)
+                axisNames.push_back(dynamicAxis.Name());
+
+            Dictionary opConfig;
+            opConfig[L"newDynamicAxes"] = AsDictionaryValueVector(axisNames);
+            return UnaryOp(PrimitiveOpType::Where, condition, std::move(opConfig), name);
+        }
+
+        // Gathers from 'operand' using 'condition':
+        //   Where(condition, newDynamicAxes) -> PackedIndex(operand, ...) -> GatherPacked(operand, ...).
+        // 'name' is given to the outermost (GatherPacked) function; the intermediate functions stay unnamed.
+        FunctionPtr Gather(const Variable& operand, const Variable& condition, const std::vector<Axis>& newDynamicAxes, const std::wstring& name /*= L""*/)
+        {
+            return Internal::GatherPacked(operand, Internal::PackedIndex(operand, Where(condition, newDynamicAxes)), name);
+        }
+
+        // Creates a Slice primitive function over 'operand'.
+        // The axis is stored by name; the begin/end indices are stored as size_t values in the op configuration.
+        FunctionPtr Slice(const Variable& operand, const Axis& axis, int beginIndex, int endIndex, const std::wstring& name /*= L""*/)
+        {
+            Dictionary opConfig;
+            opConfig[L"axis"] = axis.Name();
+            opConfig[L"beginIndex"] = static_cast<size_t>(beginIndex);
+            opConfig[L"endIndex"] = static_cast<size_t>(endIndex);
+
+            return UnaryOp(PrimitiveOpType::Slice, operand, std::move(opConfig), name);
+        }
+
+        // Reduces 'operand' along 'axis' with the reduction named by 'reductionOpName'.
+        //
+        // - Static axis: creates a ReduceElements primitive. The axis index is stored 1-based
+        //   (StaticAxisIndex() + 1) under "CNTKInternalReductionAxisIndex".
+        // - Default batch axis: unsupported (LogicError).
+        // - Other dynamic axis: only the sum reduction is supported (LogicError otherwise). It is
+        //   built as a recurrence (running sum via PastValue) whose last frame is then sliced out.
+        //
+        // 'name' is applied to the resulting function on both the static and the dynamic path.
+        FunctionPtr ReduceElements(const Variable& operand, const std::wstring& reductionOpName, const Axis& axis, const std::wstring& name /*= L""*/)
+        {
+            if (axis.IsStaticAxis())
+            {
+                auto additionalProperties = Dictionary();
+                additionalProperties[L"CNTKInternalReductionAxisIndex"] = (size_t)(axis.StaticAxisIndex() + 1);
+                additionalProperties[L"ReductionOpName"] = reductionOpName;
+                return UnaryOp(PrimitiveOpType::ReduceElements, operand, std::move(additionalProperties), name);
+            }
+
+            if (axis == Axis::DefaultBatchAxis())
+                LogicError("Reduction is currently unsupported along the batch axis");
+
+            if (reductionOpName != PrimitiveFunction::InternalSumReductionOpName)
+                LogicError("%S reduction along dynamic axis is currently unsupported", reductionOpName.c_str());
+
+            // We are reducing over a dynamic axis which is currently implemented using recurrence.
+            // Only the sum reduction reaches this point, so the accumulation step is a plain Plus.
+            auto cumulativeSumFunctionPlaceholder = Placeholder(operand.Shape());
+            auto prevAccumulatedValuesFunction = PastValue(ScalarConstant(operand.GetDataType(), 0.0f), cumulativeSumFunctionPlaceholder, 1);
+            auto cumulativeSumFunction = Plus(prevAccumulatedValuesFunction, operand, L"");
+            cumulativeSumFunction->ReplacePlaceholders({ { cumulativeSumFunctionPlaceholder, cumulativeSumFunction } });
+
+            // The last frame of the running sum holds the full reduction; give it the caller-supplied name
+            return CNTK::Slice(cumulativeSumFunction, axis, -1, 0, name);
+        }
+   }
 }
--- a/Source/CNTKv2LibraryDll/Function.h
+++ b/Source/CNTKv2LibraryDll/Function.h
@ -27,7 +27,12 @@ namespace CNTK
        Abs,
        Reciprocal,
        Softmax,
+        Hardmax,
+        TransposeAxes,
+        Where,
+        Slice,
        Pooling,
+        SumAll,
        Plus,
        Minus,
        ElementTimes,
@ -37,14 +42,17 @@ namespace CNTK
        LessEqual,
        Greater,
        GreaterEqual,
+        PackedIndex,
+        GatherPacked,
        Times,
+        TransposeTimes,
        Convolution,
        SquaredError,
        CrossEntropyWithSoftmax,
        ClassificationError,
        PastValue,
        FutureValue,
-        ReduceSum,
+        ReduceElements,
        BatchNormalization,
        Combine,
    };
@ -77,7 +85,12 @@ namespace CNTK
            { PrimitiveOpType::Abs, "Abs" },
            { PrimitiveOpType::Reciprocal, "Reciprocal" },
            { PrimitiveOpType::Softmax, "Softmax" },
+            { PrimitiveOpType::Hardmax, "Hardmax" },
+            { PrimitiveOpType::TransposeAxes, "TransposeAxes" },
+            { PrimitiveOpType::Where, "Where" },
+            { PrimitiveOpType::Slice, "Slice" },
            { PrimitiveOpType::Pooling, "Pooling" },
+            { PrimitiveOpType::SumAll, "SumAll" },
            { PrimitiveOpType::Plus, "Plus" },
            { PrimitiveOpType::Minus, "Minus" },
            { PrimitiveOpType::ElementTimes, "ElementTimes" },
@ -87,14 +100,17 @@ namespace CNTK
            { PrimitiveOpType::LessEqual, "LessEqual" },
            { PrimitiveOpType::Greater, "Greater" },
            { PrimitiveOpType::GreaterEqual, "GreaterEqual" },
+            { PrimitiveOpType::PackedIndex, "PackedIndex" },
+            { PrimitiveOpType::GatherPacked, "GatherPacked" },
            { PrimitiveOpType::Times, "Times" },
+            { PrimitiveOpType::TransposeTimes, "TransposeTimes" },
            { PrimitiveOpType::Convolution, "Convolution" },
            { PrimitiveOpType::SquaredError, "SquaredError" },
            { PrimitiveOpType::CrossEntropyWithSoftmax, "CrossEntropyWithSoftmax" },
            { PrimitiveOpType::ClassificationError, "ClassificationError" },
            { PrimitiveOpType::PastValue, "PastValue" },
            { PrimitiveOpType::FutureValue, "FutureValue" },
-            { PrimitiveOpType::ReduceSum, "ReduceSum" },
+            { PrimitiveOpType::ReduceElements, "ReduceElements" },
            { PrimitiveOpType::BatchNormalization, "BatchNormalization" },
            { PrimitiveOpType::Combine, "Combine" }
        };
@ -107,6 +123,15 @@ namespace CNTK

    class PrimitiveFunction final : public Function
    {
+    public:
+        static const std::wstring InternalSumReductionOpName;
+        static const std::wstring InternalLogSumReductionOpName;
+        static const std::wstring InternalMeanReductionOpName;
+        static const std::wstring InternalMaxReductionOpName;
+        static const std::wstring InternalMinReductionOpName;
+        static const std::wstring InternalAllReductionOpName;
+        static const std::wstring InternalAnyReductionOpName;
+
    public:
        PrimitiveFunction(PrimitiveOpType op, const std::vector<Variable>& inputs, Dictionary&& functionConfig, const std::wstring& functionName = L"")
            : Function(inputs, GetOutputVariables(op, inputs, this, functionConfig), nullptr, functionName), m_op(op), m_functionConfig(std::move(functionConfig))
@ -242,16 +267,26 @@ namespace CNTK
            DataType outputDataType = inputs[0].GetDataType();

            // We currently require that the inputs' dynamic axes if any match
-            std::vector<Axis> outputDynamicAxes = inputs[0].DynamicAxes();
-            for (auto inputVar : inputs)
+            std::vector<Axis> outputDynamicAxes;
+            if (op == PrimitiveOpType::Where)
+                ;
+            else if ((op == PrimitiveOpType::PackedIndex) || (op == PrimitiveOpType::GatherPacked))
            {
-                auto currentInputDynamicAxes = inputVar.DynamicAxes();
-                if (outputDynamicAxes.empty())
-                    outputDynamicAxes = currentInputDynamicAxes;
-                else
+                outputDynamicAxes = inputs[1].DynamicAxes();
+            }
+            else
+            {
+                outputDynamicAxes = inputs[0].DynamicAxes();
+                for (auto inputVar : inputs)
                {
-                    if (!currentInputDynamicAxes.empty() && (currentInputDynamicAxes != outputDynamicAxes))
-                        LogicError("Currently if an operand of a binary elementwise operation has any dynamic axes, those must match the dynamic axes of the other operand");
+                    auto currentInputDynamicAxes = inputVar.DynamicAxes();
+                    if (outputDynamicAxes.empty())
+                        outputDynamicAxes = currentInputDynamicAxes;
+                    else
+                    {
+                        if (!currentInputDynamicAxes.empty() && (currentInputDynamicAxes != outputDynamicAxes))
+                            LogicError("Currently if an operand of a binary elementwise operation has any dynamic axes, those must match the dynamic axes of the other operand");
+                    }
                }
            }

@ -268,9 +303,38 @@ namespace CNTK
            case PrimitiveOpType::Abs:
            case PrimitiveOpType::Reciprocal:
            case PrimitiveOpType::Softmax:
+            case PrimitiveOpType::Hardmax:
                assert(inputs.size() == 1);
+                if (((op == PrimitiveOpType::Softmax) || (op == PrimitiveOpType::Hardmax)) && (inputs[0].Shape().NumAxes() > 1))
+                    InvalidArgument("Softmax/Hardmax operation can only be applied to a 1D input");
+
                outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[0].Shape()), outputDataType, owner, outputDynamicAxes));
                break;
+            case PrimitiveOpType::TransposeAxes:
+            {
+                assert(inputs.size() == 1);
+                auto axis1 = Axis(functionConfig[L"axis1"].GetValue<std::wstring>());
+                auto axis2 = Axis(functionConfig[L"axis2"].GetValue<std::wstring>());
+
+                if (!axis1.IsStaticAxis() || !axis2.IsStaticAxis())
+                    LogicError("TransposeAxes operation currently does not support transposing dynamic axes");
+
+                auto transposedTensorShape = AsTensorShape(inputs[0].Shape(), true);
+                transposedTensorShape.SwapDimsInPlace(axis1.StaticAxisIndex(), axis2.StaticAxisIndex());
+                outputs.push_back(Variable(AsNDShape(transposedTensorShape), outputDataType, owner, outputDynamicAxes));
+                break;
+            }
+            case PrimitiveOpType::Where:
+            {
+                assert(inputs.size() == 1);
+                std::vector<Axis> newDynamicAxes;
+                auto newDynamicAxesNames = AsBasicElementTypeVector<std::wstring>(functionConfig[L"newDynamicAxes"].GetValue<std::vector<DictionaryValue>>());
+                for (auto axisName : newDynamicAxesNames)
+                    newDynamicAxes.push_back(Axis(axisName));
+
+                outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[0].Shape()), outputDataType, owner, newDynamicAxes));
+                break;
+            }
            case PrimitiveOpType::Pooling:
            {
                assert(inputs.size() == 1);
@ -282,6 +346,10 @@ namespace CNTK
                outputs.push_back(Variable(ConvolutionOpOutputShape(inputs[0].Shape(), poolingWindowsShape, { 1 }, strides, { true }, autoPadding, lowerPad, upperPad, false), outputDataType, owner, outputDynamicAxes));
                break;
            }
+            case PrimitiveOpType::SumAll:
+                assert(inputs.size() == 1);
+                outputs.push_back(Variable({}, outputDataType, owner, std::vector<Axis>({})));
+                break;
            case PrimitiveOpType::Plus:
            case PrimitiveOpType::Minus:
            case PrimitiveOpType::ElementTimes:
@ -297,15 +365,26 @@ namespace CNTK
            case PrimitiveOpType::Times:
            {
                assert(inputs.size() == 2);
-
-                // TODO: Support dynamic axes on the left operand
-                if (!inputs[0].DynamicAxes().empty())
-                    LogicError("Dynamic axes are currently unsupported for left operand of a Times operation");
-
                size_t numOutputAxes = functionConfig[L"numOutputAxes"].GetValue<size_t>();
                outputs.push_back(Variable(TimesOpOutputShape(inputs[0].Shape(), inputs[1].Shape(), numOutputAxes), outputDataType, owner, outputDynamicAxes));
                break;
            }
+            case PrimitiveOpType::TransposeTimes:
+            {
+                assert(inputs.size() == 2);
+
+                auto numLeftOperandAxes = inputs[0].Shape().NumAxes();
+                if (numLeftOperandAxes > 2)
+                    InvalidArgument("TransposeTimes operation only supports left operands of rank 1 or 2");
+
+                NDShape transposedLeftOperandShape(2, 1);
+                for (size_t i = 0; i < numLeftOperandAxes; ++i)
+                    transposedLeftOperandShape[transposedLeftOperandShape.NumAxes() - i - 1] = inputs[0].Shape()[i];
+
+                size_t numOutputAxes = functionConfig[L"numOutputAxes"].GetValue<size_t>();
+                outputs.push_back(Variable(TimesOpOutputShape(transposedLeftOperandShape, inputs[1].Shape(), numOutputAxes), outputDataType, owner, outputDynamicAxes));
+                break;
+            }
            case PrimitiveOpType::Convolution:
            {
                assert(inputs.size() == 2);
@ -341,26 +420,45 @@ namespace CNTK
                for (size_t i = 0; i < inputs[0].Shape().NumAxes(); ++i)
                    reductionAxes.push_back(i);

-                outputs.push_back(Variable(ReductionOpOutputShape(op, predictionShape, reductionAxes), outputDataType, owner, {}));
+                outputs.push_back(Variable(ReductionOpOutputShape(op, predictionShape, reductionAxes), outputDataType, owner, std::vector<Axis>({})));
                break;
            }
            case PrimitiveOpType::PastValue:
            case PrimitiveOpType::FutureValue:
+            {
                assert(inputs.size() == 2);
+                Variable initialStateVar = inputs[0];
+                Variable inputOperandVar = inputs[1];
+                // TODO: Currently we only support a constant, scalar (0 static axes) initial state
+                if (!initialStateVar.IsConstant() || (initialStateVar.Shape().NumAxes() > 0))
+                    LogicError("Currently PastValue/FutureValue Function only supports scalar initial state");
+
+                // TODO: We currently only support input operands with at most 1 static axis for PastValue/FutureValue
+                if (inputOperandVar.Shape().NumAxes() > 1)
+                    LogicError("Currently PastValue/FutureValue Function only supports input operand with <= 1 static axis");
+
+                // TODO: We currently only support input operands with exactly 2 dynamic axes (1 sequence axis and 1 batch axis) for PastValue/FutureValue
+                if (inputOperandVar.DynamicAxes().size() != 2)
+                    LogicError("Currently PastValue/FutureValue Function only supports input operand with with 2 dynamic axis (1 sequence-axis and 1 batch-axis)");
+
                outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[1].Shape()), outputDataType, owner, outputDynamicAxes));
                break;
-            case PrimitiveOpType::ReduceSum:
+            }
+            case PrimitiveOpType::ReduceElements:
            {
                assert(inputs.size() == 1);
-
-                // TODO: For reductions, we should remove any of the dynamic axes from 'outputDynamicAxes' that are being reduced over. 
-                // Currently we only support reductions that reduce over all axes
-                std::vector<Axis> reductionOutputDynamicAxes = {};
+                auto CNTKInternalReductionAxisIndex = functionConfig[L"CNTKInternalReductionAxisIndex"].GetValue<size_t>();
                std::vector<size_t> reductionAxes;
-                for (size_t i = 0; i < inputs[0].Shape().NumAxes(); ++i)
-                    reductionAxes.push_back(i);
+                // TODO: Do not use a integer literal for the special value of axis id that indicates all static axes
+                if (CNTKInternalReductionAxisIndex == 0)
+                {
+                    for (size_t i = 0; i < inputs[0].Shape().NumAxes(); ++i)
+                        reductionAxes.push_back(i);
+                }
+                else
+                    reductionAxes.push_back(CNTKInternalReductionAxisIndex - 1);

-                outputs.push_back(Variable(ReductionOpOutputShape(op, inputs[0].Shape(), reductionAxes), outputDataType, owner, reductionOutputDynamicAxes));
+                outputs.push_back(Variable(ReductionOpOutputShape(op, inputs[0].Shape(), reductionAxes), outputDataType, owner, inputs[0].DynamicAxes()));
                break;
            }
            case PrimitiveOpType::BatchNormalization:
@ -369,6 +467,60 @@ namespace CNTK
            case PrimitiveOpType::Combine:
                outputs = inputs;
                break;
+            case PrimitiveOpType::PackedIndex:
+                outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[1].Shape()), outputDataType, owner, outputDynamicAxes));
+                break;
+            case PrimitiveOpType::GatherPacked:
+            {
+                // Output shape inference for GatherPacked: the shape is derived from the
+                // source operand (inputs[0]) only.
+                const NDShape& sourceShape = inputs[0].Shape();
+                bool sourceHasDynamicAxis = !inputs[0].DynamicAxes().empty();
+                NDShape outputShape;
+
+                // inherit tensor dimension from sourceData, minus the last (column or time) dimension. TODO this needs to become simpler...
+                if (sourceHasDynamicAxis)
+                    outputShape = sourceShape;
+                else
+                {
+                    // Slice the *source* shape. Slicing the still-default-constructed 'outputShape'
+                    // would ignore the source dimensions and compute 'NumAxes() - 1' on it.
+                    if (sourceShape.NumAxes() > 1)
+                        outputShape = sourceShape.SubShape(0, sourceShape.NumAxes() - 1);
+                    else
+                        outputShape = {};
+                }
+
+                outputs.push_back(Variable(outputShape, outputDataType, owner, outputDynamicAxes));
+                break;
+            }
+            case PrimitiveOpType::Slice:
+            {
+                auto axis = Axis(functionConfig[L"axis"].GetValue<std::wstring>());
+                int beginIndex = functionConfig[L"beginIndex"].GetValue<size_t>();
+                int endIndex = functionConfig[L"endIndex"].GetValue<size_t>();
+                if (!axis.IsStaticAxis())
+                    LogicError("Built-in Slice operation currently does not support slicing along dynamic axis");
+
+                if (axis.StaticAxisIndex() >= inputs[0].Shape().NumAxes())
+                    InvalidArgument("The specified axis index (%d) for the Slice operation is outside the bounds of the available axes of the input", (int)axis.StaticAxisIndex());
+
+                size_t sliceAxisDim = inputs[0].Shape()[axis.StaticAxisIndex()];
+                int realBeginIndex = (beginIndex >= 0) ? beginIndex : beginIndex + sliceAxisDim;
+                int realEndIndex = (endIndex > 0) ? endIndex : endIndex + sliceAxisDim;
+                if ((sliceAxisDim < realEndIndex) || (realEndIndex < realBeginIndex) || (realBeginIndex < 0))
+                    RuntimeError("Slice operation: Index range [%d,%d), interpreted as [%d,%d), is invalid for input ([%S]).",
+                                 beginIndex,
+                                 endIndex,
+                                 realBeginIndex,
+                                 realEndIndex,
+                                 inputs[0].Shape().AsString().c_str());
+
+                auto outputTensorShape = AsTensorShape(inputs[0].Shape(), true);
+
+                // propagate as much as we can
+                if ((axis.StaticAxisIndex() < outputTensorShape.GetRank()) && (0 <= realBeginIndex) && (realBeginIndex <= realEndIndex) && (realEndIndex <= sliceAxisDim))
+                    outputTensorShape.NarrowTo(axis.StaticAxisIndex(), realBeginIndex, realEndIndex);
+
+
+                outputs.push_back(Variable(AsNDShape(outputTensorShape), outputDataType, owner, outputDynamicAxes));
+                break;
+            }
            default:
                LogicError("Specified op %s not yet supported", PrimitiveOpTypeName(op));
                break;
@ -417,6 +569,17 @@ namespace CNTK
                                                         std::unordered_map<StreamInfo, std::pair<NDArrayViewPtr, NDArrayViewPtr>>& computedMeanAndInvStdDevs,
                                                         const DeviceDescriptor& device /*= DeviceDescriptor::CPUDevice()*/);

+    public:
+        static std::wstring s_internalDefaultDynamicAxisName;
+        static std::wstring s_internalNoSequenceAxisName;
+
+        static Axis NextAutoGeneratedDynamicAxis()
+        {
+            static std::atomic<unsigned int> nextAutoGeneratedDynamicAxis(0);
+            static const std::wstring autoGeneratedDynamicAxisNamePrefix = L"autoGeneratedDynamicAxis_";
+            return Axis(autoGeneratedDynamicAxisNamePrefix + std::to_wstring(nextAutoGeneratedDynamicAxis++));
+        }
+
    public:
        static CompositeFunctionPtr Create(const FunctionPtr& rootFunction, const std::wstring& name = L"")
        {
@ -524,4 +687,17 @@ namespace CNTK
        // the next 'Backward' call.
        std::unordered_set<Variable> m_currentBackpropRoots;
    };
+
+    // Maps an internal dynamic-axis name onto the list of dynamic axes of a Variable.
+    // The two reserved internal names select the default sequence+batch axes and the
+    // batch-only axis list respectively; any other name yields a named axis followed by
+    // the default batch axis.
+    inline std::vector<CNTK::Axis> DynamicAxesFromInternalDynamicAxisName(const std::wstring& internalDynamicAxisName)
+    {
+        if (internalDynamicAxisName == CNTK::CompositeFunction::s_internalDefaultDynamicAxisName)
+            return std::vector<CNTK::Axis>{ CNTK::Axis::DefaultDynamicAxis(), CNTK::Axis::DefaultBatchAxis() };
+
+        if (internalDynamicAxisName == CNTK::CompositeFunction::s_internalNoSequenceAxisName)
+            return std::vector<CNTK::Axis>{ CNTK::Axis::DefaultBatchAxis() };
+
+        return std::vector<CNTK::Axis>{ CNTK::Axis(internalDynamicAxisName), CNTK::Axis::DefaultBatchAxis() };
+    }
 }
--- a/Source/CNTKv2LibraryDll/Learner.cpp
+++ b/Source/CNTKv2LibraryDll/Learner.cpp
@ -8,19 +8,18 @@
 #include "Utils.h"

 #define UPDATE_FUNCTION                                                                                       \
-    switch (smoothedGradientValue->GetDataType())                                                     \
+    switch (smoothedGradientValue->GetDataType())                                                             \
    {                                                                                                         \
    case DataType::Float:                                                                                     \
-        Update<float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);  \
+        Update<float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);                  \
        break;                                                                                                \
    case DataType::Double:                                                                                    \
-        Update<double>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \
+        Update<double>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);                 \
        break;                                                                                                \
    default:                                                                                                  \
        NOT_IMPLEMENTED;                                                                                      \
    }

-
 using namespace Microsoft::MSR::CNTK;
 using namespace std;

@ -141,7 +140,7 @@ namespace CNTK
        // L1 regularizer with proximal gradient descent method
        if (m_additionalOptions.l1RegularizationWeight > 0)
        {
-            auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
+            auto learningRate = ElementType(m_learningRates[m_sampleCount]);
            // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
            auto weight = ElementType(learningRate * m_additionalOptions.l1RegularizationWeight * actualMBSize);
            parameterValue->GetWritableMatrix<ElementType>()->InplaceSoftThreshold(weight);
@ -154,48 +153,49 @@ namespace CNTK
        return arrayView->GetWritableTensorView<ElementType>();
    }

-    LearnerBase::LearnerBase(const unordered_set<Parameter>& parameters)
+    LearnerBase::LearnerBase(const unordered_set<Parameter>& parameters, 
+                             const LearningRatesPerSample& learningRates,
+                             bool allocateSmoothGradients /* = true */)
        : Learner(parameters),
-        m_learningRatePerSample(0.0),
-        m_sampleCount(0)
+        m_learningRates(learningRates),
+        m_sampleCount(0),
+        m_minibatchCount(0)
    {
-        const unordered_set<Parameter>& parameterSet = parameters;
-        for (const auto& parameter : parameterSet)
+        for (const auto& parameter : parameters)
        {
-            // TODO: using the same device to allocate data for all smoothed gradients. Is this correct?
-            // Should the device be specified on the per-parameter basis?
-            NDArrayViewPtr view;
-            if (parameter.GetDataType() == DataType::Float)
+            if (!allocateSmoothGradients)
            {
-                view = MakeSharedObject<NDArrayView>(0.0f, parameter.Shape(), parameter.Value()->Device());
+                continue;
            }
-            else
-            {
-                view = MakeSharedObject<NDArrayView>(0.0, parameter.Shape(), parameter.Value()->Device());
-            }
-
+                
+            NDArrayViewPtr view = AllocateNDArrayView(parameter, parameter.Shape());
            m_smoothedGradientValues.insert(make_pair(parameter, view));
-            m_additionalOptions.learningRateMultipliers.insert(make_pair(parameter, 1.0));
        }
    }

-    void LearnerBase::ResetSmoothedGradients()
+    /*static*/ NDArrayViewPtr LearnerBase::AllocateNDArrayView(const Parameter& parameter, const NDShape& shape) 
    {
-        for (const auto& parameter : Parameters())
+        if (parameter.GetDataType() == DataType::Float)
        {
-            const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
-            const auto& data = smoothedGradientValue;
-            switch (data->GetDataType())
-            {
-            case DataType::Float:
-                data->SetValue(0.0f);
-                break;
-            case DataType::Double:
-                data->SetValue(0.0);
-                break;
-            default:
-                LogicError("Unsupported DataType %s", ::CNTK::DataTypeName(data->GetDataType()));
-            }
+            return MakeSharedObject<NDArrayView>(float(0.0), shape, parameter.Value()->Device());
+        }
+        else
+        {
+            return MakeSharedObject<NDArrayView>(0.0, shape, parameter.Value()->Device());
+        }
+    }
+
+    /*static*/ NDShape LearnerBase::GetMatrixShape(const Parameter& parameter)
+    {
+        if (parameter.GetDataType() == DataType::Float)
+        {
+           auto matrix = GetMatrix<float>(parameter.Value());
+           return { matrix->GetNumRows(), matrix->GetNumCols() };
+        }
+        else
+        {
+           auto matrix = GetMatrix<double>(parameter.Value());
+           return { matrix->GetNumRows(), matrix->GetNumCols() };
        }
    }

@ -219,17 +219,19 @@ namespace CNTK
 #endif

 #if DUMPOUTPUT
+            auto learningRate = ElementType(m_learningRates[m_sampleCount]);
+            auto momentum = ElementType(MomentumPerMB(m_momentums[m_sampleCount], trainingSampleCount));
            LOGPRINTF(stderr, "learnRatePerSample=%0.8f, momentum=%0.8f, actualMBSize=%ld\n",
-                        m_learningRatePerSample, m_momentumPerSample, trainingSampleCount);
+                        learningRate, momentum, trainingSampleCount);
            LOGPRINTF(stderr, "GradUpdateType()=%s, GradientUpdateNoiseStd()=%0.8f\n",
-                        LearnerType().c_str(), m_GaussianNoiseInjectStd);
+                      LearnerType().c_str(), m_additionalOptions.gaussianNoiseInjectionStdDev);
            Print(gradientValue, "Gradient Update");
            Print(smoothedGradientValue, "Smoothed Gradient Input");
 #endif
            UPDATE_FUNCTION;

 #if DUMPOUTPUT
-            Print(parameterValue, "Parameter Update");
+            Print(parameter.Value(), "Parameter Update");
 #endif

 #ifdef _DEBUG
@ -239,6 +241,7 @@ namespace CNTK
 #endif
        }
        m_sampleCount += trainingSampleCount;
+        m_minibatchCount++;
        return false;
    }

@ -265,9 +268,16 @@ namespace CNTK

    /*virtual*/ Dictionary LearnerBase::GetCheckpointState() const /*override*/
    {
-        NOT_IMPLEMENTED; // Until the new checkpointing is fully fleshed out, nobody should be calling this.
        Dictionary checkpoint;

+        checkpoint[L"checkpointVersion"] = checkpointVersion;
+        checkpoint[L"sampleCount"] = m_sampleCount;
+        checkpoint[L"minibatchCount"] = m_minibatchCount;
+
+        // TODO: should we also save learning rate schedule into the checkpoint?
+        // If that is the case, need to be able to override this method in subclasses
+        // and save momentum schedule as well.
+
        for (const auto& parameter : Parameters())
        {
            // TODO: parameter name is not guaranteed to be unique. Instead, all serializable objects
@ -277,31 +287,48 @@ namespace CNTK
            {
                LogicError("Parameter names must be unique");
            }
-            const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);

-            // Potentially, could store things like dimensions, element size, format, etc., but
-            // that seems to be redundant, since all of that is passed in the constructor.
-            checkpoint[parameter.Name()] = SerializeToVector(smoothedGradientValue);
+            const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
+            checkpoint[parameter.Name()] = *smoothedGradientValue;
        }
        return checkpoint;
    }

    /*virtual*/ void LearnerBase::RestoreFromCheckpoint(const Dictionary& checkpoint) /*override*/
    {
-        NOT_IMPLEMENTED; // Until the new checkpointing is fully fleshed out, nobody should be calling this.
+        // Restores the sample/minibatch counters and the per-parameter smoothed gradients
+        // from a dictionary produced by GetCheckpointState().
+        // The format version is validated first (read from the "checkpointVersion" key that
+        // GetCheckpointState() writes), before the counters are overwritten.
+        size_t version = checkpoint[L"checkpointVersion"].GetValue<size_t>();
+        if (checkpointVersion != version)
+        {
+            // At the moment, we only support one version, so this should never happen.
+            LogicError("Unsupported checkpoint version.");
+        }
+
+        m_sampleCount = checkpoint[L"sampleCount"].GetValue<size_t>();
+        m_minibatchCount = checkpoint[L"minibatchCount"].GetValue<size_t>();
+
        for (const auto& parameter : Parameters())
        {
            if (!checkpoint.Contains(parameter.Name()))
            {
                LogicError("Checkpoint does not contain state for parameter %ls", parameter.Name().c_str());
            }
+
            const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
+            const NDArrayView& checkpointedValue = checkpoint[parameter.Name()].GetValue<NDArrayView>();
+            
+            // The stored state must match the learner's own buffer in both data type and shape
+            // before it is copied in.
+            if (smoothedGradientValue->GetDataType() != checkpointedValue.GetDataType())
+            {
+                LogicError("A value restored from a checkpoint for the smoothed gradient data type for parameter %ls does not match the expected value",
+                           parameter.Name().c_str());
+            }

-            const DictionaryValue& state = checkpoint[parameter.Name()];
+            if (smoothedGradientValue->Shape() != checkpointedValue.Shape())
+            {
+                LogicError("A value restored from a checkpoint for the smoothed gradient shape for parameter %ls does not match the expected value",
+                           parameter.Name().c_str());
+            }

-            const auto& data = smoothedGradientValue;
-
-            DeserializeFromVector(data, state.GetValue<vector<DictionaryValue>>());
+            smoothedGradientValue->CopyFrom(checkpointedValue);
        }
    }

@ -313,23 +340,25 @@ namespace CNTK
    template <typename ElementType>
    void LearnerSGD::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
    {
-        UNUSED(trainingSampleCount);
-
        const auto& parameterValue = parameter.Value();
        const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
        const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
        const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);

-        const auto& learningRate = ElementType(ParameterDependentLearningRate(parameter));
+        auto learningRate = ElementType(m_learningRates[m_sampleCount]);
+        auto momentum = ElementType(MomentumPerMB(m_momentums[m_sampleCount], trainingSampleCount));

        // TODO: break up the NormalGrad into 3 different functions, each with its own set of parameters
        // (one for vanilla SGD, the other for momentum SGD, and the third one for NAG).
        smoothedGradientMatrix->NormalGrad(*gradientMatrix, *parameterMatrix,
-                                            learningRate, ElementType(m_momentumPerSample), m_useNesterovAcceleration);
+                                           learningRate, momentum, m_useNesterovAcceleration);
    }

-    LearnerAdaGrad::LearnerAdaGrad(const unordered_set<Parameter>& parameters, bool needAveMultiplier)
-        : LearnerBase(parameters), m_needAveMultiplier(needAveMultiplier)
+    LearnerAdaGrad::LearnerAdaGrad(const unordered_set<Parameter>& parameters, 
+                                   const LearningRatesPerSample& learningRates,
+                                   bool needAveMultiplier)
+        : LearnerBase(parameters, learningRates), 
+        m_needAveMultiplier(needAveMultiplier)
    {
    }

@ -348,15 +377,23 @@ namespace CNTK
        const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
        const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);

-        auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
+        auto learningRate = ElementType(m_learningRates[m_sampleCount]);

        auto aveMultiplier = smoothedGradientMatrix->Adagrad(*gradientMatrix, m_needAveMultiplier);
        Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
    }

-    LearnerFSAdaGrad::LearnerFSAdaGrad(const unordered_set<Parameter>& parameters)
-        : LearnerMomentumSGD(parameters)
+    LearnerFSAdaGrad::LearnerFSAdaGrad(const unordered_set<Parameter>& parameters, 
+                                       const LearningRatesPerSample& learningRates, 
+                                       const MomentumsPerSample& momentums)
+        : LearnerMomentumSGD(parameters, learningRates, momentums, /*allocateSmoothGradients*/ false)
    {
+        for (const auto& parameter : parameters)
+        {  
+            auto shape = GetMatrixShape(parameter);
+            NDArrayViewPtr view = AllocateNDArrayView(parameter, {shape[0], 2 * shape[1]});
+            m_smoothedGradientValues.insert(make_pair(parameter, view));
+        }
    }

    /*virtual*/ void LearnerFSAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
@ -373,21 +410,33 @@ namespace CNTK
        const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
        const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
        const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);
-
-        //const double momentum = MomentumPerMB(m_momentumPerSample, trainingSampleCount);
-
-        auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
-
-        smoothedGradientMatrix->FSAdagrad(trainingSampleCount, *gradientMatrix, *parameterMatrix,
-                                            learningRate, ElementType(m_momentumPerSample));
+        
+        auto learningRate = ElementType(m_learningRates[m_sampleCount]);
+        auto momentum = ElementType(MomentumPerMB(m_momentums[m_sampleCount], trainingSampleCount));
+        smoothedGradientMatrix->FSAdagrad(trainingSampleCount, *gradientMatrix, *parameterMatrix, learningRate, momentum);
    }

-    LearnerRMSProp::LearnerRMSProp(const unordered_set<Parameter>& parameters,
-                                    double gamma, double inc, double dec, double max, double min, bool needAveMultiplier)
-                                    : LearnerBase(parameters),
-                                    m_gamma(gamma), m_inc(inc), m_dec(dec), m_max(max), m_min(min),
-                                    m_needAveMultiplier(needAveMultiplier)
+    LearnerRMSProp::LearnerRMSProp(const unordered_set<Parameter>& parameters, const LearningRatesPerSample& learningRates,
+                                   double gamma, double inc, double dec, double max, double min, bool needAveMultiplier)
+                                   : LearnerBase(parameters, learningRates, /*allocateSmoothGradients*/ false),
+                                   m_gamma(gamma), m_inc(inc), m_dec(dec), m_max(max), m_min(min),
+                                   m_needAveMultiplier(needAveMultiplier)
    {
+        for (const auto& parameter : parameters)
+        {  
+            // When needAveMultiplier == true, CPU and GPU implementations of RMSProp require different number of columns.
+            // TODO: verify that this is correct.
+            size_t factor = 3;
+            if (needAveMultiplier && parameter.Value()->Device().Type() == DeviceKind::GPU)
+            {
+                factor = 4;
+            }
+
+            auto shape = GetMatrixShape(parameter);
+            NDArrayViewPtr view = AllocateNDArrayView(parameter, {shape[0], factor * shape[1]});
+
+            m_smoothedGradientValues.insert(make_pair(parameter, view));
+        }
    }

    /*virtual*/ void LearnerRMSProp::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
@ -405,12 +454,12 @@ namespace CNTK
        const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
        const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);

-        auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
+        auto learningRate = ElementType(m_learningRates[m_sampleCount]);

        auto aveMultiplier = smoothedGradientMatrix->RmsProp(*gradientMatrix,
-                                                                ElementType(m_gamma), ElementType(m_inc),
-                                                                ElementType(m_max), ElementType(m_dec),
-                                                                ElementType(m_min), m_needAveMultiplier);
+                                                             ElementType(m_gamma), ElementType(m_inc),
+                                                             ElementType(m_max), ElementType(m_dec),
+                                                             ElementType(m_min), m_needAveMultiplier);
        Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
    }

@ -418,34 +467,35 @@ namespace CNTK
    template shared_ptr<Matrix<float>> LearnerBase::GetWritableMatrix<float>(const NDArrayViewPtr& arrayView);
    template shared_ptr<Matrix<double>> LearnerBase::GetWritableMatrix<double>(const NDArrayViewPtr& arrayView);
    
-    LearnerPtr SGDLearner(const unordered_set<Parameter>& parameters, double learningRatePerSample)
+    LearnerPtr SGDLearner(const unordered_set<Parameter>& parameters, const LearningRatesPerSample& learningRates)
    {
-        return MakeSharedObject<LearnerSGD>(parameters, learningRatePerSample);
+        return MakeSharedObject<LearnerSGD>(parameters, learningRates);
    }

-    LearnerPtr MomentumSGDLearner(const unordered_set<Parameter>& parameters)
+    LearnerPtr MomentumSGDLearner(const unordered_set<Parameter>& parameters, const LearningRatesPerSample& learningRates, const MomentumsPerSample& momentums)
    {
-        return MakeSharedObject<LearnerMomentumSGD>(parameters);
+        return MakeSharedObject<LearnerMomentumSGD>(parameters, learningRates, momentums);
    }

-    LearnerPtr NesterovLearner(const unordered_set<Parameter>& parameters)
+    LearnerPtr NesterovLearner(const unordered_set<Parameter>& parameters, const LearningRatesPerSample& learningRates, const MomentumsPerSample& momentums)
    {
-        return MakeSharedObject<LearnerNesterov>(parameters);
+        return MakeSharedObject<LearnerNesterov>(parameters, learningRates, momentums);
    }

-    LearnerPtr AdaGradLearner(const unordered_set<Parameter>& parameters, bool needAveMultiplier)
+    LearnerPtr AdaGradLearner(const unordered_set<Parameter>& parameters, const LearningRatesPerSample& learningRates, bool needAveMultiplier)
    {
-        return MakeSharedObject<LearnerAdaGrad>(parameters, needAveMultiplier);
+        return MakeSharedObject<LearnerAdaGrad>(parameters, learningRates, needAveMultiplier);
    }

-    LearnerPtr FSAdaGradLearner(const unordered_set<Parameter>& parameters)
+    LearnerPtr FSAdaGradLearner(const unordered_set<Parameter>& parameters, const LearningRatesPerSample& learningRates, const MomentumsPerSample& momentums)
    {
-        return MakeSharedObject<LearnerFSAdaGrad>(parameters);
+        return MakeSharedObject<LearnerFSAdaGrad>(parameters, learningRates, momentums);
    }

-    LearnerPtr RMSPropLearner(const unordered_set<Parameter>& parameters,
-                                double gamma, double inc, double dec, double max, double min, bool needAveMultiplier)
+    LearnerPtr RMSPropLearner(const unordered_set<Parameter>& parameters, const LearningRatesPerSample& learningRates,
+                              double gamma, double inc, double dec, double max, double min, 
+                              bool needAveMultiplier)
    {
-        return MakeSharedObject<LearnerRMSProp>(parameters, gamma, inc, dec, max, min, needAveMultiplier);
+        return MakeSharedObject<LearnerRMSProp>(parameters, learningRates, gamma, inc, dec, max, min, needAveMultiplier);
    }
 }
--- a/Source/CNTKv2LibraryDll/Learner.h
+++ b/Source/CNTKv2LibraryDll/Learner.h
@ -9,6 +9,7 @@

 namespace CNTK 
 {
+    // TODO: Move this to Trainer along with Pre-, PostProcess and ClipGradient.
    // A collection of additional options that are applicable for all standard learners 
    // (after these options are set, they retain their value for the entire lifespan of a learner).
    struct AdditionalLearningOptions
@ -18,7 +19,6 @@ namespace CNTK
        double gaussianNoiseInjectionStdDev = 0.0;
        bool gradientClippingWithTruncation = true;
        double gradientClippingThresholdPerSample = std::numeric_limits<double>::infinity();
-        std::unordered_map<Parameter, double> learningRateMultipliers;
    };

    // An abstract base class at the root of the standard learners hierarchy
@ -33,32 +33,16 @@ namespace CNTK

        virtual void RestoreFromCheckpoint(const Dictionary& checkpoint) override final;

-        void SetAdditionalOptions(const AdditionalLearningOptions& additionalOptions)
-        {
-            m_additionalOptions = additionalOptions;
-        }
-
-        // TODO: should this be called ResetMomentum?
-        // needed for BlockMomemtumSGD to reset SGD momentum after aggregation.
-        void ResetSmoothedGradients();
-
-        // TODO: move learning rate and momentum scheduling and adjustment functionality 
-        // inside the learner and drop these setters.
-        void SetLearningRate(double value) { m_learningRatePerSample = value; }
-
    protected:
-        LearnerBase(const std::unordered_set<Parameter>& parameters);
+        LearnerBase(const std::unordered_set<Parameter>& parameters, 
+                    const LearningRatesPerSample& learningRates,
+                    bool allocateSmoothGradients = true);

        virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const = 0;

-        double ParameterDependentLearningRate(const Parameter& parameter) const
-        {
-            return m_learningRatePerSample * m_additionalOptions.learningRateMultipliers.at(parameter);
-        }
-
        std::string LearnerType() const;

-        double m_learningRatePerSample;
+        LearningRatesPerSample m_learningRates;

        AdditionalLearningOptions m_additionalOptions;

@ -91,6 +75,16 @@ namespace CNTK
        template <typename ElementType>
        void PostProcess(const Parameter& parameter, const NDArrayViewPtr& gradientValue, size_t actualMBSize) const;

+        // Returns an NDArrayView with the required shape, with the same data type as parameter value
+        // and allocated on the same device.
+        static NDArrayViewPtr AllocateNDArrayView(const Parameter& parameter, const NDShape& shape);
+
+        // Retrieves the shape of the matrix corresponding to the parameter value.
+        static NDShape GetMatrixShape(const Parameter& parameter);
+
+        size_t m_sampleCount;
+        size_t m_minibatchCount;
+
    private:
        // Templatized update function, it invokes preprocess and postprocess using the provided
        // template parameter and also invokes virtual Update method implemented in one of the subclasses.
@ -101,18 +95,20 @@ namespace CNTK
        static bool HasNan(const NDArrayViewPtr& value, const char* name);
        static void Print(const NDArrayViewPtr& value, const char* msg);

-        size_t m_sampleCount;
+        static const size_t checkpointVersion = 1;
    };

    // Vanilla gradient descent optimization algorithm.
    class LearnerSGD : public LearnerBase
    {
    public:
-        LearnerSGD(const std::unordered_set<Parameter>& parameters, double learningRatePerSample = 0)
-            : LearnerBase(parameters), m_momentumPerSample(0.0), m_useNesterovAcceleration(false)
-        {
-            SetLearningRate(learningRatePerSample);
-        }
+        LearnerSGD(const std::unordered_set<Parameter>& parameters, 
+                   const LearningRatesPerSample& learningRates, 
+                   bool allocateSmoothGradients = true)
+            : LearnerBase(parameters, learningRates, allocateSmoothGradients), 
+            m_momentums(0.0), 
+            m_useNesterovAcceleration(false)
+        { }

    protected:

@ -121,7 +117,8 @@ namespace CNTK
        template <typename ElementType>
        void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;

-        double m_momentumPerSample;
+        // TODO: Move m_momentums to LearnerMomentumSGD as soon as NormalGrad is refactored.
+        MomentumsPerSample m_momentums;
        bool m_useNesterovAcceleration;
    };

@ -129,20 +126,25 @@ namespace CNTK
    class LearnerMomentumSGD : public LearnerSGD
    {
    public:
-        LearnerMomentumSGD(const std::unordered_set<Parameter>& parameters)
-            : LearnerSGD(parameters)
-        {}
-
-        void SetMomentum(double value) { m_momentumPerSample = value; }
+        LearnerMomentumSGD(const std::unordered_set<Parameter>& parameters, 
+                           const LearningRatesPerSample& learningRates,
+                           const MomentumsPerSample& momentums,
+                           bool allocateSmoothGradients = true)
+            : LearnerSGD(parameters, learningRates, allocateSmoothGradients)
+        {
+            m_momentums = momentums;
+        }
    };

    // Nesterov's accelerated SGDLearnerBase descent. 
-    class LearnerNesterov : public LearnerSGD
+    class LearnerNesterov : public LearnerMomentumSGD
    {
    public:

-        LearnerNesterov(const std::unordered_set<Parameter>& parameters)
-            : LearnerSGD(parameters)
+        LearnerNesterov(const std::unordered_set<Parameter>& parameters, 
+                        const LearningRatesPerSample& learningRates,
+                        const MomentumsPerSample& momentums)
+            : LearnerMomentumSGD(parameters, learningRates, momentums)
        {
            m_useNesterovAcceleration = true;
        }
@ -152,7 +154,9 @@ namespace CNTK
    {
    public:

-        LearnerAdaGrad(const std::unordered_set<Parameter>& parameters, bool needAveMultiplier);
+        LearnerAdaGrad(const std::unordered_set<Parameter>& parameters, 
+                       const LearningRatesPerSample& learningRates,
+                       bool needAveMultiplier);

    protected:
        bool m_needAveMultiplier;
@ -167,7 +171,9 @@ namespace CNTK
    {
    public:

-        LearnerFSAdaGrad(const std::unordered_set<Parameter>& parameters);
+        LearnerFSAdaGrad(const std::unordered_set<Parameter>& parameters,
+                         const LearningRatesPerSample& learningRates,
+                         const MomentumsPerSample& momentums);

    protected:

@ -182,7 +188,9 @@ namespace CNTK
    public:

        LearnerRMSProp(const std::unordered_set<Parameter>& parameters,
-                       double gamma, double inc, double dec, double max, double min, bool needAveMultiplier);
+                       const LearningRatesPerSample& learningRates,
+                       double gamma, double inc, double dec, double max, double min,
+                       bool needAveMultiplier);

    protected:

--- a/Source/CNTKv2LibraryDll/MinibatchSource.cpp
+++ b/Source/CNTKv2LibraryDll/MinibatchSource.cpp
@ -49,10 +49,12 @@ namespace CNTK
            m_streamInfos.insert({ streamDesc->m_name, streamDesc->m_id, AsStorageFormat(streamDesc->m_storageType), AsDataType(streamDesc->m_elementType), AsNDShape(*(streamDesc->m_sampleLayout)) });
    }

-    /*virtual*/ std::unordered_map<StreamInfo, MinibatchData> CompositeMinibatchSource::GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
-                                                                                                         const DeviceDescriptor& device /*= DeviceDescriptor::DefaultDevice()*/) /*override*/
+    /*virtual*/ const std::unordered_map<StreamInfo, MinibatchData>&
+    CompositeMinibatchSource::GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
+                                               const DeviceDescriptor& device /*= DeviceDescriptor::DefaultDevice()*/) /*override*/
    {
-        std::unordered_map<StreamInfo, MinibatchData> minibatchData;
+        m_minibatchData.clear();
+
        if (!m_epochEndReached)
        {
            // TODO: Support different minibatch sizes for different streams
@ -117,7 +119,9 @@ namespace CNTK
                auto currentStreamMinibatchData = compositeReaderMinibatchData.m_data[i];
                if (currentStreamDesc->m_elementType == ElementType::tfloat)
                {
-                    auto dataMatrix = std::make_shared<Matrix<float>>(CPUDEVICE);
+                    auto CNTKMatrixType = (currentStreamDesc->m_storageType == StorageType::dense) ? DENSE : SPARSE;
+                    auto CNTKMatrixFormat = (currentStreamDesc->m_storageType == StorageType::dense) ? matrixFormatDense : matrixFormatSparseCSC;
+                    auto dataMatrix = std::make_shared<Matrix<float>>(0, 0, CPUDEVICE, CNTKMatrixType, CNTKMatrixFormat);
                    size_t sampleSize = currentStreamDesc->m_sampleLayout->GetNumElements();

                    // TODO: Eliminate the unnecessary CPU to CPU copy
@ -127,14 +131,14 @@ namespace CNTK
                    size_t numSamples = currentStreamMinibatchData->m_layout->GetActualNumSamples();
                    size_t numSequences = currentStreamMinibatchData->m_layout->GetNumSequences();

-                    minibatchData[currentStreamInfo] = { numSequences, numSamples, minibatchValuePtr };
+                    m_minibatchData[currentStreamInfo] = { numSequences, numSamples, minibatchValuePtr };
                }
                else
                    LogicError("Input data of type other than DataType::Float is currently unsupported by the CNTK built-in composite MinibatchSource!");
            }
        }

-        return minibatchData;
+        return m_minibatchData;
    }

    void ComputeInputPerDimMeansAndInvStdDevs(const MinibatchSourcePtr& minibatchSource,
--- a/Source/CNTKv2LibraryDll/MinibatchSource.h
+++ b/Source/CNTKv2LibraryDll/MinibatchSource.h
@ -19,8 +19,8 @@ namespace CNTK

        virtual const std::unordered_set<StreamInfo>& StreamInfos() override { return m_streamInfos; }

-        virtual std::unordered_map<StreamInfo, MinibatchData> GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
-                                                                               const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice()) override;
+        virtual const std::unordered_map<StreamInfo, MinibatchData>& GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
+                                                                                      const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice()) override;

    private: 
        std::unordered_set<StreamInfo> m_streamInfos;
@ -28,5 +28,6 @@ namespace CNTK
        bool m_epochEndReached;
        size_t m_prevMinibatchSize;
        size_t m_epochSize;
+        std::unordered_map<StreamInfo, MinibatchData> m_minibatchData;
    };
 }
--- a/Source/CNTKv2LibraryDll/Trainer.cpp
+++ b/Source/CNTKv2LibraryDll/Trainer.cpp
@ -61,11 +61,12 @@ namespace CNTK
                    LogicError("The gradient value for a Parameter cannot have an associated mask!");
            }

-            auto trainingLossArguments = m_trainingLossVar.Owner()->Arguments();
-            auto labelsVar = *(std::find_if(trainingLossArguments.begin(), trainingLossArguments.end(), [](const Variable& var) {
-                return var.IsInput();
-            }));
-            auto argumentValue = arguments.at(labelsVar);
+            auto trainingLossArgument = *(m_trainingLossVar.Owner()->Arguments().begin());
+
+            // Find the argument whose dynamic axes match the criterion operation's dynamic axes (i.e. label dynamic axes)
+            auto argumentValue = std::find_if(arguments.begin(), arguments.end(), [trainingLossArgument](const std::pair<Variable, ValuePtr>& currentPair) {
+                return (currentPair.first.DynamicAxes() == trainingLossArgument.DynamicAxes());
+            })->second;
            auto argumentData = argumentValue->Data();
            auto argumentDataShape = argumentData->Shape();
            auto mask = argumentValue->Mask();
--- a/Source/CNTKv2LibraryDll/Utils.cpp
+++ b/Source/CNTKv2LibraryDll/Utils.cpp
@ -6,20 +6,100 @@
 #include "stdafx.h"
 #include "CNTKLibrary.h"
 #include "Utils.h"
-#include "File.h"
+#include <istream>
+#include <ostream>

 using namespace std;

 namespace CNTK
 {
+    // This wrapper redefines operator<< in terms of unformatted (binary) write operation.
+    struct BinaryOStreamWrapper
+    {
+        BinaryOStreamWrapper(ostream& s) : m_stream(s) {}
+
+        template<typename T>
+        typename std::enable_if<std::is_pod<T>::value, BinaryOStreamWrapper&>::type
+        operator<<(const T& value)
+        { 
+            m_stream.write(reinterpret_cast<const char*>(&value), sizeof(T)); 
+            return *this ; 
+        }
+
+        BinaryOStreamWrapper& operator<<(const wstring& str)
+        { 
+            *this << str.length();
+            m_stream.write(reinterpret_cast<const char*>(str.c_str()), str.length() * sizeof(wchar_t)); 
+            return *this; 
+        }
+
+        operator ostream& () { return m_stream; }
+
+        ostream& m_stream;
+        BinaryOStreamWrapper(const BinaryOStreamWrapper&) = delete; BinaryOStreamWrapper(BinaryOStreamWrapper&&) = delete; BinaryOStreamWrapper& operator=(const BinaryOStreamWrapper&) = delete; BinaryOStreamWrapper& operator=(BinaryOStreamWrapper&&) = delete;
+    };
+
+    // This wrapper redefines operator>> in terms of unformatted (binary) read operation.
+    struct BinaryIStreamWrapper
+    {
+        BinaryIStreamWrapper(istream& s) : m_stream(s) {}
+
+        // Reads sizeof(T) raw bytes directly into 'value'. Reading into the destination object
+        // (instead of casting a char buffer to T*) avoids misaligned and aliasing-violating access.
+        template<typename T>
+        typename std::enable_if<std::is_pod<T>::value, BinaryIStreamWrapper&>::type
+        operator>>(T& value)
+        {
+            static_assert(sizeof(T) <= sizeof(size_t), "size_t is the largest supported type.");
+            m_stream.read(reinterpret_cast<char*>(&value), sizeof(T));
+            return *this;
+        }
+
+        // Reads a wide string stored as a size_t length followed by 'length' raw wchar_t values.
+        BinaryIStreamWrapper& operator>>(wstring& str)
+        {
+            size_t length = 0; // never resize by an indeterminate value if the prefix read fails
+            *this >> length;
+            str.resize(length);
+            if (length > 0)
+            {
+                m_stream.read(reinterpret_cast<char*>(&str[0]), length * sizeof(wchar_t));
+            }
+
+            return *this;
+        }
+
+        operator istream& () const { return m_stream ;}
+
+        istream& m_stream;
+        BinaryIStreamWrapper(const BinaryIStreamWrapper&) = delete; BinaryIStreamWrapper(BinaryIStreamWrapper&&) = delete; BinaryIStreamWrapper& operator=(const BinaryIStreamWrapper&) = delete; BinaryIStreamWrapper& operator=(BinaryIStreamWrapper&&) = delete;
+    };
+
+    template <typename T>
+    T* CreateDataPtr(const T& value)
+    {
+        return new T(value);
+    }
+
+    template <>
+    NDArrayView* CreateDataPtr<NDArrayView>(const NDArrayView& value)
+    {
+        // TODO: replace this copy with an alias to value.
+        NDArrayView* viewPtr = new NDArrayView(value.GetDataType(), value.Shape(), DeviceDescriptor::CPUDevice());
+        viewPtr->CopyFrom(value);
+        return viewPtr;
+    }
+
    template <typename T>
    void DictionaryValue::AllocateDataPtr(const T& value)
    {
        static_assert(is_same<T, NDShape>::value ||
                      is_same<T, wstring>::value ||
                      is_same<T, vector<DictionaryValue>>::value ||
-                      is_same<T, Dictionary>::value, "AllocateDataPtr called with invalid type");
-        m_data.m_ptr = new T(value);
+                      is_same<T, Dictionary>::value ||
+                      is_same<T, NDArrayView>::value,
+                      "AllocateDataPtr called with invalid type");
+        m_data.m_ptr = CreateDataPtr<T>(value);
    }

    template <typename T>
@ -31,12 +111,163 @@ namespace CNTK
        m_data.m_ptr = nullptr;
    }

-    Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, DictionaryValue& us)
+    // Returns true if both views have the same data type and shape and all of their
+    // corresponding elements compare equal. A view that is not on the CPU is first
+    // copied into a temporary CPU view and the copy is compared instead.
+    template <typename ElementType>
+    bool AreEqual(NDArrayView& view1, NDArrayView& view2)
    {
+        if (view1.GetDataType() != view2.GetDataType() ||
+            view1.Shape() != view2.Shape())
+        {
+            return false;
+        }
+
+        // The staging copies are declared at function scope so that the buffers the raw
+        // pointers below refer to stay alive until the comparison loop has finished.
+        NDArrayViewPtr temp1CpuDataView;
+        NDArrayViewPtr temp2CpuDataView;
+
+        // Each view is staged independently, since the two views may live on different devices.
+        ElementType* data1 = nullptr;
+        if (view1.Device().Type() == DeviceKind::CPU)
+        {
+            data1 = view1.WritableDataBuffer<ElementType>();
+        }
+        else
+        {
+            temp1CpuDataView = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), view1.Shape(), DeviceDescriptor::CPUDevice());
+            temp1CpuDataView->CopyFrom(view1);
+            data1 = temp1CpuDataView->WritableDataBuffer<ElementType>();
+        }
+
+        ElementType* data2 = nullptr;
+        if (view2.Device().Type() == DeviceKind::CPU)
+        {
+            data2 = view2.WritableDataBuffer<ElementType>();
+        }
+        else
+        {
+            temp2CpuDataView = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), view2.Shape(), DeviceDescriptor::CPUDevice());
+            temp2CpuDataView->CopyFrom(view2);
+            data2 = temp2CpuDataView->WritableDataBuffer<ElementType>();
+        }
+
+        const size_t numElements = view1.Shape().TotalSize();
+
+        for (size_t i = 0; i < numElements; ++i)
+        {
+            if (data1[i] != data2[i])
+            {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    bool DictionaryValue::operator==(const DictionaryValue& other) const
+    {
+        if (this == &other)
+        {
+            return true;
+        }
+
+        if (m_valueType != other.m_valueType)
+        {
+            return false;
+        }
+        
+        switch (m_valueType)
+        {
+        case DictionaryValue::Type::Bool:
+            return (m_data.m_boolean == other.m_data.m_boolean);
+        case DictionaryValue::Type::SizeT:
+            return (m_data.m_sizeT == other.m_data.m_sizeT);
+        case DictionaryValue::Type::Float:
+            return (m_data.m_float == other.m_data.m_float);
+        case DictionaryValue::Type::Double:
+            return (m_data.m_double == other.m_data.m_double);
+        case DictionaryValue::Type::String:
+        {
+            wstring* strPtr1 = reinterpret_cast<wstring*>(m_data.m_ptr);
+            wstring* strPtr2 = reinterpret_cast<wstring*>(other.m_data.m_ptr);
+            return (*strPtr1 == *strPtr2);
+        }
+        case DictionaryValue::Type::NDShape:
+        {
+            NDShape* shapePtr1 = reinterpret_cast<NDShape*>(m_data.m_ptr);
+            NDShape* shapePtr2 = reinterpret_cast<NDShape*>(other.m_data.m_ptr);
+            return (*shapePtr1 == *shapePtr2);
+        }
+        case DictionaryValue::Type::Vector:
+        {   
+            vector<DictionaryValue>* vectorPtr1 = reinterpret_cast<vector<DictionaryValue>*>(m_data.m_ptr);
+            vector<DictionaryValue>* vectorPtr2 = reinterpret_cast<vector<DictionaryValue>*>(other.m_data.m_ptr);
+            return (*vectorPtr1 == *vectorPtr2);
+        }
+        case DictionaryValue::Type::Dictionary:
+        {
+            Dictionary* dictPtr1 = reinterpret_cast<Dictionary*>(m_data.m_ptr);
+            Dictionary* dictPtr2 = reinterpret_cast<Dictionary*>(other.m_data.m_ptr);
+            return (*dictPtr1 == *dictPtr2);
+        }
+        case DictionaryValue::Type::NDArrayView:
+        {
+            NDArrayView* viewPtr1 = reinterpret_cast<NDArrayView*>(m_data.m_ptr);
+            NDArrayView* viewPtr2 = reinterpret_cast<NDArrayView*>(other.m_data.m_ptr);
+
+            switch (viewPtr1->GetDataType())
+            {
+            case DataType::Float:
+                return AreEqual<float>(*viewPtr1, *viewPtr2);
+            case DataType::Double:
+                return AreEqual<double>(*viewPtr1, *viewPtr2);
+            default:
+                NOT_IMPLEMENTED;
+            }
+        }
+        default:
+            NOT_IMPLEMENTED;
+        }
+    }
+    
+    bool DictionaryValue::operator!=(const DictionaryValue& other) const
+    {
+        return !(*this == other);    
+    }
+
+    
+    // Writes an NDShape as its number of axes followed by the dimension of each axis.
+    BinaryOStreamWrapper& operator<<(BinaryOStreamWrapper& stream, const NDShape& us)
+    {
+        // The count is pinned to size_t because the reading side consumes a size_t prefix;
+        // the index uses the same unsigned type to avoid a signed/unsigned comparison.
+        const size_t size = us.NumAxes();
+        stream << size;
+        for (size_t i = 0; i < size; i++)
+        {
+            stream << us[i];
+        }
+        return stream;
+    }
+
+    // Writes every element of a CPU-resident view to the stream as raw values of type T,
+    // in the order in which they appear in the view's data buffer.
+    template <typename T>
+    void Write(BinaryOStreamWrapper& stream, const NDArrayView& view)
+    {
+        assert(view.Device().Type() == DeviceKind::CPU);
+
+        // The index is a size_t: 'auto i = 0' deduces int, which overflows on views
+        // holding more than INT_MAX elements.
+        const size_t numElements = view.Shape().TotalSize();
+        const T* buffer = view.DataBuffer<T>();
+        for (size_t i = 0; i < numElements; ++i)
+        {
+            stream << buffer[i];
+        }
+    }
+
+    // Fills the data buffer of a CPU-resident view with raw values of type T read from the
+    // stream, one value per element of the view's shape.
+    template <typename T>
+    void Read(BinaryIStreamWrapper& stream, NDArrayView& view)
+    {
+        assert(view.Device().Type() == DeviceKind::CPU);
+
+        // The index is a size_t: 'auto i = 0' deduces int, which overflows on views
+        // holding more than INT_MAX elements.
+        const size_t numElements = view.Shape().TotalSize();
+        T* buffer = view.WritableDataBuffer<T>();
+        for (size_t i = 0; i < numElements; ++i)
+        {
+            stream >> buffer[i];
+        }
+    }
+
+    istream& operator>>(istream& stdStream, DictionaryValue& us)
+    {
+        BinaryIStreamWrapper stream(stdStream);
        size_t version;
        stream >> version;
-
-        stream >> us.m_valueType;
+        
+        unsigned int type;
+        stream >> type;
+        us.m_valueType = static_cast<DictionaryValue::Type>(type);

        switch (us.ValueType())
        {
@ -52,28 +283,72 @@ namespace CNTK
        case DictionaryValue::Type::Double:
            stream >> us.m_data.m_double;
            break;
+        case DictionaryValue::Type::String:
+        {
+            wstring* strPtr = new wstring();
+            stream >> *strPtr;
+            us.m_data.m_ptr = strPtr;
+            break;
+        }
        case DictionaryValue::Type::NDShape:
        {
            size_t size;
            stream >> size;
-            vector<size_t> dims(size);
+            NDShape* shapePtr = new NDShape(size);
            for (auto i = 0; i < size; i++)
            {
-                stream >> dims[i];
+                stream >> shapePtr->operator[](i);
            }
-            us.AllocateDataPtr(NDShape(dims));
+            us.m_data.m_ptr = shapePtr;
            break;
        }
        case DictionaryValue::Type::Vector:
-        {
+        {   
            size_t size;
            stream >> size;
-            vector<DictionaryValue> values(size);
+            vector<DictionaryValue>* vectorPtr = new vector<DictionaryValue>(size);
            for (auto i = 0; i < size; i++)
            {
-                stream >> values[i];
+                stream >> vectorPtr->at(i);
            }
-            us.AllocateDataPtr(values);
+            us.m_data.m_ptr = vectorPtr;
+            break;
+        }
+        case DictionaryValue::Type::Dictionary:
+        {
+            Dictionary* dictPtr = new Dictionary();
+            stream >> *dictPtr;
+            us.m_data.m_ptr = dictPtr;
+            break;
+        }
+        case DictionaryValue::Type::NDArrayView:
+        {
+            unsigned int type;
+            stream >> type;
+            DataType dtype = static_cast<DataType>(type);
+
+            size_t size;
+            stream >> size;
+            NDShape shape(size);
+            for (auto i = 0; i < size; i++)
+            {
+                stream >> shape[i];
+            }
+
+            NDArrayView* viewPtr = new NDArrayView(dtype, shape, DeviceDescriptor::CPUDevice());
+            switch (dtype)
+            {
+            case DataType::Float:
+                Read<float>(stream, *viewPtr);
+                break;
+            case DataType::Double:
+                Read<double>(stream, *viewPtr);
+                break;
+            default:
+                LogicError("Unsupported DataType %s", DataTypeName(dtype));
+            }
+
+            us.m_data.m_ptr = viewPtr;
            break;
        }
        default:
@ -82,11 +357,13 @@ namespace CNTK
        return stream;
    }

-    Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const DictionaryValue& us)
+    ostream& operator<<(ostream& stdStream, const DictionaryValue& us)
    {
+        BinaryOStreamWrapper stream(stdStream);
+
        stream << us.version;

-        stream << us.ValueType();
+        stream << static_cast<unsigned int>(us.ValueType());

        switch (us.ValueType())
        {
@ -102,15 +379,16 @@ namespace CNTK
        case DictionaryValue::Type::Double:
            stream << us.m_data.m_double;
            break;
+        case DictionaryValue::Type::String:
+        {
+            wstring* stringPtr = reinterpret_cast<wstring*>(us.m_data.m_ptr);
+            stream << *stringPtr;
+            break;
+        }
        case DictionaryValue::Type::NDShape:
        {
            NDShape* shapePtr = reinterpret_cast<NDShape*>(us.m_data.m_ptr);
-            auto size = shapePtr->NumAxes();
-            stream << size;
-            for (auto i = 0; i < size; i++)
-            {
-                stream << shapePtr->operator[](i);
-            }
+            stream << *shapePtr;
            break;
        }
        case DictionaryValue::Type::Vector:
@ -121,7 +399,31 @@ namespace CNTK
            stream << size;
            for (auto i = 0; i < size; i++)
            {
-                stream << vectorPtr->operator[](i);
+                stream << vectorPtr->at(i);
+            }
+            break;
+        }
+        case DictionaryValue::Type::Dictionary:
+        {
+            Dictionary* dictPtr = reinterpret_cast<Dictionary*>(us.m_data.m_ptr);
+            stream << *dictPtr;
+            break;
+        }
+        case DictionaryValue::Type::NDArrayView:
+        {
+            NDArrayView* viewPtr = reinterpret_cast<NDArrayView*>(us.m_data.m_ptr);
+            stream << static_cast<unsigned int>(viewPtr->GetDataType());
+            stream << viewPtr->Shape();
+            switch (viewPtr->GetDataType())
+            {
+            case DataType::Float:
+                Write<float>(stream, *viewPtr);
+                break;
+            case DataType::Double:
+                Write<double>(stream, *viewPtr);
+                break;
+            default:
+                LogicError("Unsupported DataType %s", DataTypeName(viewPtr->GetDataType()));
            }
            break;
        }
@ -148,7 +450,7 @@ namespace CNTK
    Dictionary& Dictionary::operator=(const Dictionary& other)
    {
        assert(this != &other);
-        m_dictionaryData.reset(new std::unordered_map<std::wstring, DictionaryValue>(*(other.m_dictionaryData)));
+        m_dictionaryData.reset(new unordered_map<wstring, DictionaryValue>(*(other.m_dictionaryData)));
        return *this;
    }

@ -183,20 +485,51 @@ namespace CNTK
        return (m_dictionaryData->find(key) != m_dictionaryData->end());
    }

-    Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const Dictionary& us)
+    bool Dictionary::operator==(const Dictionary& other) const
    {
+        // An object always equals itself.
+        if (this == &other)
+            return true;
+
+        const auto& lhsEntries = *m_dictionaryData;
+        const auto& rhsEntries = *(other.m_dictionaryData);
+        if (lhsEntries.size() != rhsEntries.size())
+            return false;
+
+        // With equal sizes it suffices that every entry on the left has an equal
+        // entry under the same key on the right.
+        for (auto entry = lhsEntries.begin(); entry != lhsEntries.end(); ++entry)
+        {
+            auto match = rhsEntries.find(entry->first);
+            if (match == rhsEntries.end())
+                return false;
+
+            if (entry->second != match->second)
+                return false;
+        }
+
+        return true;
+    }
+    
+    bool Dictionary::operator!=(const Dictionary& other) const
+    {
+        return !(*this == other);    
+    }
+
+    ostream& operator<<(ostream& stdStream, const Dictionary& us)
+    {
+        BinaryOStreamWrapper stream(stdStream);
        stream << us.version;
        stream << us.m_dictionaryData->size();
-        for (auto it = us.m_dictionaryData->begin(); it != us.m_dictionaryData->end(); ++it)
+        for (auto& kv : *(us.m_dictionaryData))
        {
-            stream << it->first;
-            stream << it->second;
+            stream << kv.first;
+            stream << kv.second;
        }
        return stream;
    }

-    Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, Dictionary& us)
+    istream& operator>>(istream& stdStream, Dictionary& us)
    {
+        BinaryIStreamWrapper stream(stdStream);
        size_t version;
        stream >> version;
        size_t size;
@ -206,113 +539,36 @@ namespace CNTK
        {
            wstring key;
            stream >> key;
-            DictionaryValue value;
-            stream >> value;
-            us.m_dictionaryData->insert(make_pair(key, value));
+            stream >> us[key];
        }
        return stream;
    }

+    // Returns the element whose key is greater than the required sample count 
+    // or the last element if no such key exists.
    template <typename T>
-    vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr)
+    const T& TrainingParameterSchedule<T>::operator[](size_t sampleCount) const
    {
-        if (viewPtr->IsSparse())
+        assert(m_schedule.size() > 0);
+        auto it = m_schedule.upper_bound(sampleCount);
+        if (it == m_schedule.end())
        {
-            LogicError("Sparse NDArrayView cannot be serialized into a vector.");
+            --it;
        }
-
-        auto numElements = viewPtr->Shape().TotalSize();
-
-        vector<DictionaryValue> values(numElements);
-
-        NDArrayViewPtr cpuDataViewPtr = viewPtr;
-        if ((viewPtr->Device().Type() != DeviceKind::CPU))
-        {
-            cpuDataViewPtr = MakeSharedObject<NDArrayView>(viewPtr->GetDataType(), viewPtr->Shape(), DeviceDescriptor::CPUDevice());
-            cpuDataViewPtr->CopyFrom(*viewPtr);
-        }
-
-        const T* buffer = cpuDataViewPtr->DataBuffer<T>();
-        for (auto i = 0; i < numElements; ++i)
-        {
-            T v = buffer[i];
-            values[i] = DictionaryValue(v);
-        }
-
-        return values;
+        return it->second;
    }

-    template <typename T>
-    void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const vector<DictionaryValue>& values)
-    {
-        if (viewPtr->IsSparse())
-        {
-            LogicError("Sparse NDArrayView cannot be deserialized from a vector.");
-        }
-
-        auto numElements = viewPtr->Shape().TotalSize();
-
-        if (values.size() != numElements)
-        {
-            LogicError("Number of elements (%lu) in the deserialized representation does not match the expected value (%lu)",
-                        values.size(), numElements);
-        }
-
-        NDArrayViewPtr cpuDataViewPtr = viewPtr;
-        if ((viewPtr->Device().Type() != DeviceKind::CPU))
-        {
-            cpuDataViewPtr = MakeSharedObject<NDArrayView>(viewPtr->GetDataType(), viewPtr->Shape(), DeviceDescriptor::CPUDevice());
-        }
-
-        T* buffer = cpuDataViewPtr->WritableDataBuffer<T>();
-        for (auto i = 0; i < numElements; ++i)
-        {
-            buffer[i] = values[i].GetValue<T>();
-        }
-
-        if ((viewPtr->Device().Type() != DeviceKind::CPU))
-        {
-            viewPtr->CopyFrom(*cpuDataViewPtr);
-        }
-    }
-
-    // TODO: we store the type info for every element in the vector, which is extremely redundant.
-    // Instead, it'd be nice to introduce some sort of DictionaryValueVector.
-    vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr)
-    {
-        switch (viewPtr->GetDataType())
-        {
-        case DataType::Float:
-            return SerializeToVector<float>(viewPtr);
-        case DataType::Double:
-            return SerializeToVector<double>(viewPtr);
-        default:
-            LogicError("Unsupported DataType %s", DataTypeName(viewPtr->GetDataType()));
-        }
-    }
-
-    void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const vector<DictionaryValue>& values) 
-    {
-        switch (viewPtr->GetDataType())
-        {
-        case DataType::Float:
-            DeserializeFromVector<float>(viewPtr, values);
-            break;
-        case DataType::Double:
-            DeserializeFromVector<double>(viewPtr, values);
-            break;
-        default:
-            LogicError("Unsupported DataType %s", DataTypeName(viewPtr->GetDataType()));
-        }
-    }
-     
    template void DictionaryValue::AllocateDataPtr<NDShape>(const NDShape& value);
    template void DictionaryValue::AllocateDataPtr<vector<DictionaryValue>>(const vector<DictionaryValue>& value);
    template void DictionaryValue::AllocateDataPtr<wstring>(const wstring& value);
    template void DictionaryValue::AllocateDataPtr<Dictionary>(const Dictionary& value);
+    template void DictionaryValue::AllocateDataPtr<NDArrayView>(const NDArrayView& value);

    template void DictionaryValue::FreePtrAsType<NDShape>();
    template void DictionaryValue::FreePtrAsType<vector<DictionaryValue>>();
    template void DictionaryValue::FreePtrAsType<wstring>();
    template void DictionaryValue::FreePtrAsType<Dictionary>();
+    template void DictionaryValue::FreePtrAsType<NDArrayView>();
+
+    template const double& TrainingParameterSchedule<double>::operator[](size_t key) const;
 }
--- a/Source/CNTKv2LibraryDll/Utils.h
+++ b/Source/CNTKv2LibraryDll/Utils.h
@ -167,9 +167,6 @@ namespace CNTK
        return var.IsInput() && var.IsSparse();
    }

-    std::vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr);
-
-    void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const std::vector<DictionaryValue>& values);

    inline void AddIndentation(std::wstringstream& s, size_t numIndentationSpaces)
    {
@ -250,7 +247,8 @@ namespace CNTK
        static_assert(std::is_same<T, bool>::value ||
                      std::is_same<T, size_t>::value ||
                      std::is_same<T, float>::value ||
-                      std::is_same<T, double>::value, "Unsupported ValueType");
+                      std::is_same<T, double>::value ||
+                      std::is_same<T, std::wstring>::value, "Unsupported ValueType");

        std::vector<DictionaryValue> dictionaryValueVector;
        for (auto value : basicElementTypeVector)
@ -265,7 +263,8 @@ namespace CNTK
        static_assert(std::is_same<T, bool>::value ||
            std::is_same<T, size_t>::value ||
            std::is_same<T, float>::value ||
-            std::is_same<T, double>::value, "Unsupported ValueType");
+            std::is_same<T, double>::value ||
+            std::is_same<T, std::wstring>::value, "Unsupported ValueType");

        std::vector<T> basicElementTypeVector;
        for (auto value : dictionaryValueVector)
@ -313,4 +312,19 @@ namespace CNTK

        return{ paddedOutputMapCount, kernelShape };
    }
+
+    // Creates a Constant with an empty shape ({}) that holds 'value' on the given device.
+    // For DataType::Float the value is passed through as a float; for DataType::Double it is
+    // cast to double first. Any other data type is reported through LogicError.
+    // NOTE(review): there is no return statement after LogicError, so this presumably relies on
+    // LogicError never returning - confirm against its declaration.
+    inline CNTK::Constant ScalarConstant(CNTK::DataType dataType, float value, const CNTK::DeviceDescriptor& device = CNTK::DeviceDescriptor::CPUDevice())
+    {
+        if (dataType == CNTK::DataType::Float)
+            return CNTK::Constant({}, value, device);
+        else if (dataType == CNTK::DataType::Double)
+            return CNTK::Constant({}, (double)value, device);
+        else
+            LogicError("CNTK::ScalarConstant: Unsupported DataType %s", DataTypeName(dataType));
+    }
+
+    // Returns momentumPerSample raised to the power minibatchSize (computed with std::pow).
+    inline double MomentumPerMB(double momentumPerSample, size_t minibatchSize)
+    {
+        return std::pow(momentumPerSample, minibatchSize);
+    }
 }
--- a/Source/CNTKv2LibraryDll/Variable.cpp
+++ b/Source/CNTKv2LibraryDll/Variable.cpp
@ -7,6 +7,8 @@

 namespace CNTK
 {
+    /*static*/ const std::vector<Axis> Variable::s_defaultInputVariableDynamicAxes = { Axis::DefaultDynamicAxis(), Axis::DefaultBatchAxis() };
+
    Variable::Variable(const FunctionPtr& function)
        : Variable(function->Output())
    {
--- a/Source/Common/Include/Platform.h
+++ b/Source/Common/Include/Platform.h
@ -11,6 +11,13 @@
 #define __UNIX__
 #endif

+#ifdef _MSC_VER
+// TODO: thread_local is supported in VS2015. Remove this macro when we upgrade to VS2015
+#define THREAD_LOCAL __declspec(thread)
+#else
+#define THREAD_LOCAL thread_local
+#endif
+
 // ===========================================================================
 // compiler differences
 // ===========================================================================
--- a/Source/Common/Include/RandomOrdering.h
+++ b/Source/Common/Include/RandomOrdering.h
@ -11,6 +11,7 @@
 #include <stdio.h>
 #include <vector>
 #include <algorithm>
+#include <random>

 namespace Microsoft { namespace MSR { namespace CNTK {

@ -24,6 +25,31 @@ static inline size_t rand(const size_t begin, const size_t end)
    return begin + randno % (end - begin);
 }

+// Rand based on Mersenne Twister.
+// We use our own distribution in order to match baselines between different operating systems,
+// because uniform_distribution is not guaranteed to provide the same numbers on different platforms.
+// Returns begin + rng() % (end - begin). (end - begin) must be nonzero, otherwise the modulus is zero.
+// TODO: Switching to Boost would eliminate this problem.
+static inline size_t RandMT(const size_t begin, const size_t end, std::mt19937_64& rng)
+{
+    const size_t randomNumber = rng();
+    return begin + randomNumber % (end - begin);
+}
+
+// Shuffle based on Mersenne Twister.
+// We use our own distribution in order to match baselines between different operating systems,
+// instead of using std::shuffle which uses uniform_distribution internally.
+// Every position of v is swapped with a position drawn by RandMT from the whole range [0, v.size()).
+// NOTE(review): foreach_index is a macro defined elsewhere; presumably it iterates currentLocation
+// over all indices of v - confirm.
+// TODO: Switching to Boost would eliminate this problem.
+template <typename TVector>
+inline void RandomShuffleMT(TVector& v, std::mt19937_64& rng)
+{
+    foreach_index(currentLocation, v)
+    {
+        // Pick a random location and swap with current
+        const size_t randomLocation = RandMT(0, v.size(), rng);
+        std::swap(v[currentLocation], v[randomLocation]);
+    }
+}
+
 class RandomOrdering // note: NOT thread-safe at all
 {
    // constants for randomization
--- a/Source/ComputationNetworkLib/ComputationNetwork.h
+++ b/Source/ComputationNetworkLib/ComputationNetwork.h
@ -258,13 +258,20 @@ public:
        m_evalOrders[rootNode] = nodes;
    }

+    // Returns true if m_evalOrders already contains an entry for rootNode.
+    // Lets callers test for an eval order before calling GetEvalOrder(), which raises a LogicError
+    // when no entry is found.
+    bool EvalOrderExists(const ComputationNodeBasePtr& rootNode) const
+    {
+        return m_evalOrders.find(rootNode) != m_evalOrders.end();
+    }
+
    // get depth-first traversal order
    // TODO: This is currently not immutable because it gets patched w.r.t. recurrent loops. Ideally we don't patch. Need to review and verify that it is sufficient.
    const std::list<ComputationNodeBasePtr>& GetEvalOrder(const ComputationNodeBasePtr& rootNode) const
    {
        auto iter = m_evalOrders.find(rootNode);
        if (iter == m_evalOrders.end())
+        {
            LogicError("GetEvalOrder: Called without prior call to FormEvalOrder() for %ls %ls operation", rootNode->NodeName().c_str(), rootNode->OperationName().c_str());
+        }
        return iter->second;
    }

--- a/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp
+++ b/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp
@ -76,6 +76,9 @@ void ComputationNetwork::CopySubTree(const ComputationNetwork& fromNet,

    ComputationNodeBasePtr fromRoot = fromNet.GetNodeFromName(fromName);

+    if (!fromNet.EvalOrderExists(fromRoot))
+        const_cast<ComputationNetwork&>(fromNet).FormEvalOrder(fromRoot);
+
    for (const auto& fromNode : fromNet.GetEvalOrder(fromRoot)) // BUGBUG: This probably will fail because the precomputed eval orders are invalid at this point.
    {
        wstring fromNodeName = fromNode->NodeName();
@ -353,6 +356,9 @@ void ComputationNetwork::SetLearnableNodesBelowLearningRateMultiplier(const floa
    else
    {
        // for calculating a specific node
+        if (!EvalOrderExists(rootNode))
+            const_cast<ComputationNetwork&>(*this).FormEvalOrder(rootNode);
+
        for (const auto& node : GetAllNodesForRoot(rootNode))
        {
            if (node->OperationName() == OperationNameOf(LearnableParameter))
--- a/Source/ComputationNetworkLib/ComputationNode.h
+++ b/Source/ComputationNetworkLib/ComputationNode.h
@ -32,15 +32,16 @@
 #define CNTK_MODEL_VERSION_1 1
 #define CNTK_MODEL_VERSION_2 2
 #define CNTK_MODEL_VERSION_3 3
-#define CNTK_MODEL_VERSION_4 4 // PastValue
-#define CNTK_MODEL_VERSION_5 5 // ND convolution and pooling
-#define CNTK_MODEL_VERSION_6 6 // batch-norm blending
-#define CNTK_MODEL_VERSION_7 7 // ElemType tag in model file
-#define CNTK_MODEL_VERSION_8 8 // DynamicAxis for inputs
-#define CNTK_MODEL_VERSION_9 9 // transpose flag in ConvolutionNode to support deconvolution
+#define CNTK_MODEL_VERSION_4 4   // PastValue
+#define CNTK_MODEL_VERSION_5 5   // ND convolution and pooling
+#define CNTK_MODEL_VERSION_6 6   // batch-norm blending
+#define CNTK_MODEL_VERSION_7 7   // ElemType tag in model file
+#define CNTK_MODEL_VERSION_8 8   // DynamicAxis for inputs
+#define CNTK_MODEL_VERSION_9 9   // transpose flag in ConvolutionNode to support deconvolution
 #define CNTK_MODEL_VERSION_10 10 // learning-rate multiplier for input nodes
-#define CNTK_MODEL_VERSION_11 11 // Times() m_inputRank to support parameter-rank inference
-#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_11
+#define CNTK_MODEL_VERSION_11 11 // dynamic axis name for where nodes
+#define CNTK_MODEL_VERSION_12 12 // Times() m_inputRank to support parameter-rank inference
+#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_12

 extern bool g_shareNodeValueMatrices;

--- a/Source/ComputationNetworkLib/ConvolutionalNodes.h
+++ b/Source/ComputationNetworkLib/ConvolutionalNodes.h
@ -365,6 +365,7 @@ public:
        TensorShape outputShape;
        // If 2D convolution syntax is used then some of the tensor dimensions need to be inferred.
        if (m_convolution2D)
+        // NOTE: when m_convolution2D is true, it's a legacy branch. Code should not enter here any more. 
        {
            // Need to update some tensors with correct input dims.
            auto inDims = ImageDimensions(GetInputSampleLayout(inputIdx), m_imageLayout);
@ -396,6 +397,8 @@ public:

            outputShape = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
                                                                m_sharing, m_autoPad, m_lowerPad, m_upperPad);
+            // ConvolveGeometry always uses CHW.
+            SetDims(ImageDimensions(outputShape, ImageLayoutKind::CHW).AsTensorShape(m_imageLayout), HasMBLayout());
        }
        else
        {
@ -414,9 +417,12 @@ public:
                outputShape = ConvolveGeometry::ComputeInputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
                                                                   m_sharing, m_autoPad, m_lowerPad, m_upperPad);
            }
+
+            if (m_imageLayout == ImageLayoutKind::CHW) 
+                SetDims(outputShape, HasMBLayout());
+            else    // legacy format 
+                SetDims(ImageDimensions(outputShape, ImageLayoutKind::CHW).AsTensorShape(m_imageLayout), HasMBLayout());
        }
-        // ConvolveGeometry always uses CHW.
-        SetDims(ImageDimensions(outputShape, ImageLayoutKind::CHW).AsTensorShape(m_imageLayout), HasMBLayout());

        // update LearnableParameter if it has 0 dimensions (to be inferred)
        // Typically this would be the #inputChannels (C).
--- a/Source/ComputationNetworkLib/LinearAlgebraNodes.h
+++ b/Source/ComputationNetworkLib/LinearAlgebraNodes.h
@ -450,9 +450,9 @@ public:
                assert(dimsA.size() == m_outputRank + numReductionDims);
                while (numReductionDims < inputRank)
                {
-                    dimsA.push_back(0);
-                    numReductionDims++;
-                }
+                dimsA.push_back(0);
+                numReductionDims++;
+            }
            }

            // fill in the missing ones
@ -561,8 +561,8 @@ class TransposeTimesNode : public TimesNodeBase<ElemType, true>

 public:
    DeclareConstructorFromConfigWithNumInputs(TransposeTimesNode);
-    TransposeTimesNode(DEVICEID_TYPE deviceId, const wstring& name)
-        : Base(deviceId, name, /*outputRank=*/1, /*inputRank=*/1)
+    TransposeTimesNode(DEVICEID_TYPE deviceId, const wstring& name, size_t outputRank = 1)
+        : Base(deviceId, name, outputRank, /*inputRank=*/1)
    {
    }
 };
@ -665,6 +665,9 @@ public:
            m_axis1 = 1, m_axis2 = 2; // default
    }

+    // Read-only accessors for the two axis members (m_axis1, m_axis2) of this node.
+    int Axis1() const { return m_axis1; }
+    int Axis2() const { return m_axis2; }
+
 private:
    // compute the transposed tensor shape (in-place)
    void TransposeShape(TensorShape& shape) const
--- a/Source/ComputationNetworkLib/ReshapingNodes.cpp
+++ b/Source/ComputationNetworkLib/ReshapingNodes.cpp
@ -300,7 +300,7 @@ template <class ElemType>
    if (!m_pMBLayout)
    {
        m_pMBLayout = make_shared<MBLayout>(); // this generates a new layout
-        m_pMBLayout->SetUniqueAxisName(L"WhereNodeAxis");
+        m_pMBLayout->SetUniqueAxisName(m_dynamicAxisName);
    }
    // we map scalars to scalars
    if (isFinalValidationPass && Input(0)->GetSampleLayout().GetNumElements() != 1)
--- a/Source/ComputationNetworkLib/ReshapingNodes.h
+++ b/Source/ComputationNetworkLib/ReshapingNodes.h
@ -217,6 +217,9 @@ public:
    virtual bool /*ComputationNodeBase::*/ InputUsedInComputingInputNodesGradients(size_t childIndex) const override;
    virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override;

+    // Read-only accessors for the operation attributes: a copy of m_operation and the value of m_axis.
+    std::wstring ReductionOpName() const { return m_operation; }
+    int ReductionAxis() const { return m_axis; }
+
 private:
    // operation attributes
    int m_axis;
@ -341,11 +344,12 @@ public:
        fstream << m_axis;
    }

-private:
-
    // these implement numpy-style negative bound values to index from the end
    size_t BeginIndex() const { return m_beginIndex >= 0 ? (size_t)m_beginIndex : (size_t)(m_beginIndex + Input(0)->GetSampleLayout()[m_axis - 1]); }
-    size_t EndIndex()   const { return m_endIndex   >  0 ? (size_t)m_endIndex   : (size_t)(m_endIndex   + Input(0)->GetSampleLayout()[m_axis - 1]); }
+    size_t EndIndex()   const { return m_endIndex   >  0 ? (size_t)m_endIndex : (size_t)(m_endIndex + Input(0)->GetSampleLayout()[m_axis - 1]); }
+    int Axis() const { return m_axis; }
+
+private:

    // determine the tensor shape that represents slice of the input that we are taking
    TensorShape GetInputSlice(size_t rank, const FrameRange & fr) const
@ -655,10 +659,11 @@ class WhereNode : public ComputationNodeNonLooping<ElemType>, public NumInputs<1
    typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembersBoilerplate;
    static const std::wstring TypeName() { return L"Where"; }

+    // Default dynamic axis name, used when the constructor is given no name and when loading models older than CNTK_MODEL_VERSION_11.
+    static const std::wstring DefaultWhereNodeDynamicAxisName() { return L"WhereNodeAxis"; }
 public:
    DeclareConstructorFromConfigWithNumInputs(WhereNode);
-    WhereNode(DEVICEID_TYPE deviceId, const wstring& name) :
-        Base(deviceId, name)
+    WhereNode(DEVICEID_TYPE deviceId, const wstring& name, const wstring& dynamicAxisName = DefaultWhereNodeDynamicAxisName()) :
+        Base(deviceId, name), m_dynamicAxisName(dynamicAxisName)
    {
        MarkValueNonSharable();
    }
@ -669,11 +674,29 @@ public:
    virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }
    virtual void Validate(bool isFinalValidationPass) override;

+    // Loads the node: the base-class state first, then the dynamic axis name.
+    // The name is read from the stream only for model versions >= CNTK_MODEL_VERSION_11;
+    // for older versions the default name is used instead.
+    virtual void Load(File& fstream, size_t modelVersion) override
+    {
+        Base::Load(fstream, modelVersion);
+        if (modelVersion >= CNTK_MODEL_VERSION_11)
+            fstream >> m_dynamicAxisName;
+        else
+            m_dynamicAxisName = DefaultWhereNodeDynamicAxisName();
+    }
+
+    // Saves the node: the base-class state first, followed by the dynamic axis name
+    // (the same order in which Load() reads them back).
+    virtual void Save(File& fstream) const override
+    {
+        Base::Save(fstream);
+        fstream << m_dynamicAxisName;
+    }
+
+    // Returns a copy of the dynamic axis name that Validate() passes to SetUniqueAxisName() when it creates a new MBLayout.
+    std::wstring DynamicAxisName() const { return m_dynamicAxisName; }
+
 private:
    // buffers for creating the result sequences (kept as object state to avoid memory allocations)
    std::vector<std::vector<size_t>>   m_indexSequenceBuffer; // [sequenceIndex][t] for creating the result sequences
    std::vector<size_t>               m_rowAllocationsBuffer; // [row] for determining new MBLayout packing
    std::vector<std::pair<size_t, size_t>> m_placementBuffer; // [sequenceIndex] assigned location for a sequence
+    std::wstring m_dynamicAxisName;
 };

 // -----------------------------------------------------------------------
--- a/Source/Math/CPUMatrix.cpp
+++ b/Source/Math/CPUMatrix.cpp
@ -37,14 +37,8 @@
 #pragma warning(disable : 4244) // conversion from 'double' to 'float'
 #pragma warning(disable : 4702) // unreachable code; triggered for unknown reasons

-#ifdef USE_ACML
-// Download ACML 5.3.1 (e.g., acml5.3.1-ifort64.exe) or above
-// from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/
-// Install the ifort64_mp variant (compiled with intel compiler) of the library
-// Set Environment variable ACML_PATH to C:\AMD\acml5.3.1\ifort64_mp or the folder you installed acml
-// to point to your folder for the include file and link library
-#include <acml.h> // requires ACML 5.3.1 and above
-#elif defined(USE_MKL)
+
+#ifdef USE_MKL
 // requires MKL 10.0 and above
 #include <mkl.h>
 #else
@ -57,12 +51,6 @@
 #include <lapacke.h>
 #endif

-#ifdef USE_ACML // MKL has one additional parameter for different matrix order
-#define BLAS_COLMAJOR
-#else
-#define BLAS_COLMAJOR (int) MatrixOrder::ColMajor,
-#endif
-
 #define SWAP(a, b)  \
    {               \
        (a) ^= (b); \
@ -912,11 +900,7 @@ void CPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, E
 #pragma omp parallel for
                    foreach_column (j, us)
                    {
-#ifdef USE_ACML
-                        dcopy((int) numRows, reinterpret_cast<double*>(pArray + j), (int) numCols, reinterpret_cast<double*>(bufPtr + LocateColumn(j)), 1);
-#else
                        cblas_dcopy((int) numRows, reinterpret_cast<double*>(pArray + j), (int) numCols, reinterpret_cast<double*>(bufPtr + LocateColumn(j)), 1);
-#endif
                    }
                }
                else
@ -926,11 +910,7 @@ void CPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, E
                    {
                        {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-                            scopy((int) numRows, reinterpret_cast<float*>(pArray + j), (int) numCols, reinterpret_cast<float*>(bufPtr + LocateColumn(j)), 1);
-#else
                            cblas_scopy((int) numRows, reinterpret_cast<float*>(pArray + j), (int) numCols, reinterpret_cast<float*>(bufPtr + LocateColumn(j)), 1);
-#endif
                        }
                    }
                }
@ -2844,20 +2824,12 @@ ElemType CPUMatrix<ElemType>::SumOfAbsElements() const

    if (sizeof(ElemType) == sizeof(double))
    {
-#ifdef USE_ACML
-        return (ElemType) dasum((int) GetNumElements(), reinterpret_cast<double*>(Data()), 1);
-#else
        return (ElemType) cblas_dasum((int) GetNumElements(), reinterpret_cast<double*>(Data()), 1);
-#endif
    }
    else
    {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-        return sasum((int) GetNumElements(), reinterpret_cast<float*>(Data()), 1);
-#else
        return cblas_sasum((int) GetNumElements(), reinterpret_cast<float*>(Data()), 1);
-#endif
    }
 }

@ -3028,11 +3000,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
 #pragma omp parallel for
            foreach_column (j, c)
            {
-#ifdef USE_ACML
-                c(0, j) = (ElemType) dnrm2(m, reinterpret_cast<double*>(bufPtr + us.LocateColumn(j)), 1);
-#else
                c(0, j) = (ElemType) cblas_dnrm2(m, reinterpret_cast<double*>(bufPtr + us.LocateColumn(j)), 1);
-#endif
            }
        }
        else
@ -3041,11 +3009,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
            foreach_column (j, c)
            {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-                c(0, j) = snrm2(m, reinterpret_cast<float*>(bufPtr + us.LocateColumn(j)), 1);
-#else
                c(0, j) = cblas_snrm2(m, reinterpret_cast<float*>(bufPtr + us.LocateColumn(j)), 1);
-#endif
            }
        }
    }
@ -3058,11 +3022,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
 #pragma omp parallel for
            foreach_row (i, c)
            {
-#ifdef USE_ACML
-                c(i, 0) = dnrm2(n, reinterpret_cast<double*>(bufPtr + i), m);
-#else
                c(i, 0) = cblas_dnrm2(n, reinterpret_cast<double*>(bufPtr + i), m);
-#endif
            }
        }
        else
@ -3071,11 +3031,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
            foreach_row (i, c)
            {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-                c(i, 0) = snrm2(n, reinterpret_cast<float*>(bufPtr + i), m);
-#else
                c(i, 0) = cblas_snrm2(n, reinterpret_cast<float*>(bufPtr + i), m);
-#endif
            }
        }
    }
@ -4486,34 +4442,22 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix

    int m, n, k, l;
    int lda, ldb, ldc;
-#ifdef USE_ACML
-    char transA, transB;
-#else
    CBLAS_TRANSPOSE mklTransA;
    CBLAS_TRANSPOSE mklTransB;
-#endif

    if (transposeA)
    {
        m = (int) a.GetNumCols();
        k = (int) a.GetNumRows();
        lda = k;
-#ifdef USE_ACML
-        transA = (char) MatrixTranspose::Trans;
-#else
        mklTransA = CBLAS_TRANSPOSE::CblasTrans;
-#endif
    }
    else
    {
        m = (int) a.GetNumRows();
        k = (int) a.GetNumCols();
        lda = m;
-#ifdef USE_ACML
-        transA = (char) MatrixTranspose::NoTrans;
-#else
        mklTransA = CBLAS_TRANSPOSE::CblasNoTrans;
-#endif
    }

    if (transposeB)
@ -4521,22 +4465,14 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
        l = (int) b.GetNumCols();
        n = (int) b.GetNumRows();
        ldb = n;
-#ifdef USE_ACML
-        transB = (char) MatrixTranspose::Trans;
-#else
        mklTransB = CBLAS_TRANSPOSE::CblasTrans;
-#endif
    }
    else
    {
        l = (int) b.GetNumRows();
        n = (int) b.GetNumCols();
        ldb = l;
-#ifdef USE_ACML
-        transB = (char) MatrixTranspose::NoTrans;
-#else
        mklTransB = CBLAS_TRANSPOSE::CblasNoTrans;
-#endif
    }

    assert(m > 0 && k > 0 && l > 0 && n > 0); // converting from size_t to int may cause overflow
@ -4553,20 +4489,12 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix

    if (sizeof(ElemType) == sizeof(double))
    {
-#ifdef USE_ACML
-        dgemm(transA, transB, m, n, k, alpha, reinterpret_cast<double*>(a.Data()), lda, reinterpret_cast<double*>(b.Data()), ldb, beta, reinterpret_cast<double*>(c.Data()), ldc);
-#else
-        cblas_dgemm((CBLAS_ORDER) BLAS_COLMAJOR mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<double*>(a.Data()), lda, reinterpret_cast<double*>(b.Data()), ldb, beta, reinterpret_cast<double*>(c.Data()), ldc);
-#endif
+        cblas_dgemm((CBLAS_ORDER) (int)MatrixOrder::ColMajor, mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<double*>(a.Data()), lda, reinterpret_cast<double*>(b.Data()), ldb, beta, reinterpret_cast<double*>(c.Data()), ldc);
    }
    else
    {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-        sgemm(BLAS_COLMAJOR transA, transB, m, n, k, alpha, reinterpret_cast<float*>(a.Data()), lda, reinterpret_cast<float*>(b.Data()), ldb, beta, reinterpret_cast<float*>(c.Data()), ldc);
-#else
-        cblas_sgemm((CBLAS_ORDER) BLAS_COLMAJOR mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<float*>(a.Data()), lda, reinterpret_cast<float*>(b.Data()), ldb, beta, reinterpret_cast<float*>(c.Data()), ldc);
-#endif
+        cblas_sgemm((CBLAS_ORDER) (int)MatrixOrder::ColMajor, mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<float*>(a.Data()), lda, reinterpret_cast<float*>(b.Data()), ldb, beta, reinterpret_cast<float*>(c.Data()), ldc);
    }
 }

@ -4611,9 +4539,7 @@ void CPUMatrix<ElemType>::SVD(const CPUMatrix<ElemType>& A, CPUMatrix<ElemType>&

    if (sizeof(ElemType) == sizeof(double))
    {
-#ifdef USE_ACML
-        dgesvd('A', 'A', (int) m, (int) n, reinterpret_cast<double*>(A.Data()), (int) lda, reinterpret_cast<double*>(SIGMA.Data()), reinterpret_cast<double*>(U.Data()), (int) ldu, reinterpret_cast<double*>(VT.Data()), (int) ldvt, &info);
-#elif defined(USE_MKL)
+#ifdef USE_MKL
        double wkopt;
        int lwork = -1;
        dgesvd("All", "All", &m, &n, reinterpret_cast<double*>(A.Data()), &lda, reinterpret_cast<double*>(SIGMA.Data()), reinterpret_cast<double*>(U.Data()), &ldu, reinterpret_cast<double*>(VT.Data()), &ldvt, &wkopt, &lwork, &info);
@ -4622,16 +4548,13 @@ void CPUMatrix<ElemType>::SVD(const CPUMatrix<ElemType>& A, CPUMatrix<ElemType>&
        dgesvd("All", "All", &m, &n, reinterpret_cast<double*>(A.Data()), &lda, reinterpret_cast<double*>(SIGMA.Data()), reinterpret_cast<double*>(U.Data()), &ldu, reinterpret_cast<double*>(VT.Data()), &ldvt, reinterpret_cast<double*>(W.Data()), &lwork, &info);
 #else
        std::vector<double> superb(std::max(std::min(m, n) - 1, 1));
-        info = LAPACKE_dgesvd(BLAS_COLMAJOR 'A', 'A', (int) m, (int) n, reinterpret_cast<double*>(A.Data()), (int) lda, reinterpret_cast<double*>(SIGMA.Data()),
+        info = LAPACKE_dgesvd((int) MatrixOrder::ColMajor, 'A', 'A', (int) m, (int) n, reinterpret_cast<double*>(A.Data()), (int) lda, reinterpret_cast<double*>(SIGMA.Data()),
            reinterpret_cast<double*>(U.Data()), (int) ldu, reinterpret_cast<double*>(VT.Data()), (int) ldvt, &superb[0]);
 #endif
    }
    else
    {
-#ifdef USE_ACML
-#pragma warning(suppress : 4244)
-        sgesvd('A', 'A', (int) m, (int) n, reinterpret_cast<float*>(A.Data()), (int) lda, reinterpret_cast<float*>(SIGMA.Data()), reinterpret_cast<float*>(U.Data()), (int) ldu, reinterpret_cast<float*>(VT.Data()), (int) ldvt, &info);
-#elif defined(USE_MKL)
+#ifdef USE_MKL
        float wkopt;
        int lwork = -1;
        sgesvd("All", "All", &m, &n, reinterpret_cast<float*>(A.Data()), &lda, reinterpret_cast<float*>(SIGMA.Data()), reinterpret_cast<float*>(U.Data()), &ldu, reinterpret_cast<float*>(VT.Data()), &ldvt, &wkopt, &lwork, &info);
@ -4640,7 +4563,7 @@ void CPUMatrix<ElemType>::SVD(const CPUMatrix<ElemType>& A, CPUMatrix<ElemType>&
        sgesvd("All", "All", &m, &n, reinterpret_cast<float*>(A.Data()), &lda, reinterpret_cast<float*>(SIGMA.Data()), reinterpret_cast<float*>(U.Data()), &ldu, reinterpret_cast<float*>(VT.Data()), &ldvt, reinterpret_cast<float*>(W.Data()), &lwork, &info);
 #else
        std::vector<float> superb(std::max(std::min(m, n) - 1, 1));
-        info = LAPACKE_sgesvd(BLAS_COLMAJOR 'A', 'A', (int) m, (int) n, reinterpret_cast<float*>(A.Data()), (int) lda, reinterpret_cast<float*>(SIGMA.Data()),
+        info = LAPACKE_sgesvd((int) MatrixOrder::ColMajor, 'A', 'A', (int) m, (int) n, reinterpret_cast<float*>(A.Data()), (int) lda, reinterpret_cast<float*>(SIGMA.Data()),
            reinterpret_cast<float*>(U.Data()), (int) ldu, reinterpret_cast<float*>(VT.Data()), (int) ldvt, &superb[0]);
 #endif
    }
@ -4837,20 +4760,12 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&

        if (sizeof(ElemType) == sizeof(double))
        {
-#ifdef USE_ACML
-            daxpy(len, alpha, reinterpret_cast<double*>(a.Data()), incx, reinterpret_cast<double*>(c.Data()), incy);
-#else
            cblas_daxpy(len, alpha, reinterpret_cast<double*>(a.Data()), incx, reinterpret_cast<double*>(c.Data()), incy);
-#endif
        }
        else
        {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-            saxpy(len, alpha, reinterpret_cast<float*>(a.Data()), incx, reinterpret_cast<float*>(c.Data()), incy);
-#else
            cblas_saxpy(len, alpha, reinterpret_cast<float*>(a.Data()), incx, reinterpret_cast<float*>(c.Data()), incy);
-#endif
        }
    }
    else if (a.GetNumElements() == 1) // scalar, add to all elements
@ -4889,11 +4804,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
 #pragma omp parallel for
            foreach_column (j, c)
            {
-#ifdef USE_ACML
-                daxpy(m, alpha, reinterpret_cast<double*>(aBufPtr), 1, reinterpret_cast<double*>(cBufPtr + c.LocateColumn(j)), 1);
-#else
                cblas_daxpy(m, alpha, reinterpret_cast<double*>(aBufPtr), 1, reinterpret_cast<double*>(cBufPtr + c.LocateColumn(j)), 1);
-#endif
            }
        }
        else
@ -4902,11 +4813,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
            foreach_column (j, c)
            {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-                saxpy(m, alpha, reinterpret_cast<float*>(aBufPtr), 1, reinterpret_cast<float*>(cBufPtr + c.LocateColumn(j)), 1);
-#else
                cblas_saxpy(m, alpha, reinterpret_cast<float*>(aBufPtr), 1, reinterpret_cast<float*>(cBufPtr + c.LocateColumn(j)), 1);
-#endif
            }
        }
    }
@ -4925,11 +4832,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
 #pragma omp parallel for
            foreach_row (i, c)
            {
-#ifdef USE_ACML
-                daxpy(n, alpha, reinterpret_cast<double*>(aBufPtr), 1, reinterpret_cast<double*>(cBufPtr + i), m);
-#else
                cblas_daxpy(n, alpha, reinterpret_cast<double*>(aBufPtr), 1, reinterpret_cast<double*>(cBufPtr + i), m);
-#endif
            }
        }
        else
@ -4938,11 +4841,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
            foreach_row (i, c)
            {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-                saxpy(n, alpha, reinterpret_cast<float*>(aBufPtr), 1, reinterpret_cast<float*>(cBufPtr + i), m);
-#else
                cblas_saxpy(n, alpha, reinterpret_cast<float*>(aBufPtr), 1, reinterpret_cast<float*>(cBufPtr + i), m);
-#endif
            }
        }
    }
@ -5163,20 +5062,12 @@ template <class ElemType>
    }
    else if (sizeof(ElemType) == sizeof(double))
    {
-#ifdef USE_ACML
-        dscal(len, alpha, reinterpret_cast<double*>(a.Data()), incx); // TODO: Use overloads.
-#else
        cblas_dscal(len, alpha, reinterpret_cast<double*>(a.Data()), incx);
-#endif
    }
    else
    {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-        sscal(len, alpha, reinterpret_cast<float*>(a.Data()), incx);
-#else
        cblas_sscal(len, alpha, reinterpret_cast<float*>(a.Data()), incx);
-#endif
    }
 }

@ -5224,11 +5115,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
 #pragma omp parallel for
            foreach_column (j, c)
            {
-#ifdef USE_ACML
-                c(0, j) = (ElemType) ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn(j)), 1);
-#else
                c(0, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn(j)), 1);
-#endif
            }
        }
        else
@ -5237,11 +5124,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
            foreach_column (j, c)
            {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-                c(0, j) = (ElemType) sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn(j)), 1);
-#else
                c(0, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn(j)), 1);
-#endif
            }
        }
    }
@ -5256,11 +5139,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
 #pragma omp parallel for
            foreach_row (i, c)
            {
-#ifdef USE_ACML
-                c(i, 0) = ddot(n, reinterpret_cast<double*>(aBufPtr + i), m, reinterpret_cast<double*>(bBufPtr + i), m);
-#else
                c(i, 0) = cblas_ddot(n, reinterpret_cast<double*>(aBufPtr + i), m, reinterpret_cast<double*>(bBufPtr + i), m);
-#endif
            }
        }
        else
@ -5269,11 +5148,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
            foreach_row (i, c)
            {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-                c(i, 0) = sdot(n, reinterpret_cast<float*>(aBufPtr + i), m, reinterpret_cast<float*>(bBufPtr + i), m);
-#else
                c(i, 0) = cblas_sdot(n, reinterpret_cast<float*>(aBufPtr + i), m, reinterpret_cast<float*>(bBufPtr + i), m);
-#endif
            }
        }
    }
@ -5298,20 +5173,12 @@ ElemType CPUMatrix<ElemType>::InnerProductOfMatrices(const CPUMatrix<ElemType>&

    if (sizeof(ElemType) == sizeof(double))
    {
-#ifdef USE_ACML
-        return (ElemType) ddot((int) a.GetNumElements(), reinterpret_cast<double*>(a.Data()), 1, reinterpret_cast<double*>(b.Data()), 1);
-#else
        return (ElemType) cblas_ddot((int) a.GetNumElements(), reinterpret_cast<double*>(a.Data()), 1, reinterpret_cast<double*>(b.Data()), 1);
-#endif
    }
    else
    {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-        return (ElemType) sdot((int) a.GetNumElements(), reinterpret_cast<float*>(a.Data()), 1, reinterpret_cast<float*>(b.Data()), 1);
-#else
        return (ElemType) cblas_sdot((int) a.GetNumElements(), reinterpret_cast<float*>(a.Data()), 1, reinterpret_cast<float*>(b.Data()), 1);
-#endif
    }
 }

@ -5539,21 +5406,13 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
        {
            for (long j = 0; j < n; j++)
            {
-#ifdef USE_ACML
-                c(0, j) = (ElemType) ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn(j)), 1);
-#else
                c(0, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn(j)), 1);
-#endif
            }
            for (long j = 0; j < n; j++)
            {
                for (long i = 1; i < negnumber + 1; i++)
                {
-#ifdef USE_ACML
-                    c(i, j) = (ElemType) ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn((j + shift + i - 1) % n)), 1);
-#else
                    c(i, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn((j + shift + i - 1) % n)), 1);
-#endif
                }
            }
        }
@ -5561,21 +5420,13 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
        {
            for (long j = 0; j < n; j++)
            {
-#ifdef USE_ACML
-                c(0, j) = (ElemType) sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn(j)), 1);
-#else
                c(0, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn(j)), 1);
-#endif
            }
            for (long j = 0; j < n; j++)
            {
                for (long i = 1; i < negnumber + 1; i++)
                {
-#ifdef USE_ACML
-                    c(i, j) = (ElemType) sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn((j + shift + i - 1) % n)), 1);
-#else
                    c(i, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn((j + shift + i - 1) % n)), 1);
-#endif
                }
            }
        }
@ -5593,11 +5444,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
 #pragma omp parallel for
            foreach_row (i, c)
            {
-#ifdef USE_ACML
-                c(i, 0) = (ElemType) ddot(n, reinterpret_cast<double*>(aBufPtr + i), m, reinterpret_cast<double*>(bBufPtr + i), m);
-#else
                c(i, 0) = (ElemType) cblas_ddot(n, reinterpret_cast<double*>(aBufPtr + i), m, reinterpret_cast<double*>(bBufPtr + i), m);
-#endif
            }
        }
        else
@ -5606,11 +5453,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
            foreach_row (i, c)
            {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-                c(i, 0) = sdot(n, reinterpret_cast<float*>(aBufPtr + i), m, reinterpret_cast<float*>(bBufPtr + i), m);
-#else
                c(i, 0) = cblas_sdot(n, reinterpret_cast<float*>(aBufPtr + i), m, reinterpret_cast<float*>(bBufPtr + i), m);
-#endif
            }
        }
    }
@ -6025,13 +5868,11 @@ int CPUMatrix<ElemType>::SetNumThreads(int numThreads)
    omp_set_num_threads(numThreads);
    numThreads = omp_get_max_threads();

-#ifdef USE_ACML
-    acmlsetnumthreads(numThreads);
-#elif defined(USE_MKL)
-    mkl_set_num_threads(numThreads);
-#elif defined(USE_OPENBLAS)
-    openblas_set_num_threads(numThreads);
-#endif
+    #ifdef USE_MKL
+        mkl_set_num_threads(numThreads);
+    #elif defined(USE_OPENBLAS)
+        openblas_set_num_threads(numThreads);
+    #endif
 #endif
    return numThreads;
 }
--- a/Source/Math/CPUSparseMatrix.cpp
+++ b/Source/Math/CPUSparseMatrix.cpp
@ -23,15 +23,7 @@

 #pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this

-#ifdef USE_ACML
-// use ACML as default.
-// Download ACML 5.3.0 (e.g., acml5.3.0-ifort64.exe) or above
-// from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/
-// Install the ifort64 variant (compiled with intel compiler) of the library
-// Set Environment variable ACML_PATH to C:\AMD\acml5.3.0\ifort64_mp or the folder you installed acml
-// to point to your folder for the include file and link library
-#include <acml.h> // requires ACML 5.3.0 and above
-#elif defined(USE_MKL)
+#ifdef USE_MKL
 // requires MKL 10.0 and above
 #include <mkl.h>
 #else
@ -53,12 +45,6 @@
 //    return 42;
 //}

-#ifdef USE_ACML // MKL has one additional parameter for different matrix order
-#define BLAS_COLMAJOR
-#else
-#define BLAS_COLMAJOR (int) MatrixOrder::ColMajor,
-#endif
-
 // TODO: Move to CommonMatrix.h
 #define IDX2C(i, j, ld) (((j) * (ld)) + (i)) // 0 based indexing

@ -261,11 +247,23 @@ void CPUSparseMatrix<ElemType>::SetValue(const CPUSparseMatrix<ElemType>& v)
    RequireSizeAndAllocate(v.GetNumRows(), v.GetNumCols(), v.NzSize());
    let nz = v.NzCount();

+    auto matrixFormat = v.GetFormat();
+    if (((matrixFormat == matrixFormatSparseBlockCol) || (matrixFormat == matrixFormatSparseBlockRow)) && (v.GetBlockIdShift() > 0))
+        NOT_IMPLEMENTED;
+
    if (nz > 0)
    {
        memcpy(NzValues(),    v.NzValues(),    v.NzSize());
-        memcpy(RowLocation(), v.RowLocation(), v.RowSize());
-        memcpy(ColLocation(), v.ColLocation(), v.ColSize());
+
+        if ((matrixFormat == matrixFormatSparseCSC) || (matrixFormat == matrixFormatSparseCSR))
+        {
+            memcpy(RowLocation(), v.RowLocation(), v.RowSize());
+            memcpy(ColLocation(), v.ColLocation(), v.ColSize());
+        }
+        else
+        {
+            memcpy(GetBlockIds(), v.GetBlockIds(), v.GetBlockSize());
+        }
    }
    if (v.m_sliceViewOffset > 0)
    {
@ -384,6 +382,66 @@ CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::DoGatherColumnsOf(ElemType
    return *this;
 }

+// *this[:,idx[j]] = a[:,j] * alpha + *this[:,idx[j]] * beta, for sparse CSC 'a' and *this (other formats: NOT_IMPLEMENTED).
+// Only beta == 0 into a target without non-zeros is supported; NaN or negative entries of 'idx' are gaps and are skipped.
+template <class ElemType>
+CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::DoScatterColumnsOf(ElemType beta, const CPUMatrix<ElemType>& idx, const CPUSparseMatrix<ElemType>& a, ElemType alpha)
+{
+    VerifyWritable(__func__);
+
+    if ((a.GetFormat() != matrixFormatSparseCSC) || (GetFormat() != matrixFormatSparseCSC))
+        NOT_IMPLEMENTED;
+
+    if (idx.GetNumRows() != 1) // index is 1-dimensional only
+        InvalidArgument("DoScatterColumnsOf: Map must be a row vector.");
+
+    if (beta != 0)
+        NOT_IMPLEMENTED;
+
+    if (NzCount() != 0)
+        InvalidArgument("CPUSparseMatrix::DoScatterColumnsOf: The target matrix cannot have pre-existing non-zero values when being scattered into");
+
+    if (beta == 0)
+        RequireSizeAndAllocate(GetNumRows(), GetNumCols(), a.NzCount());
+
+    // Setup the Secondary index: record, per target column, how many non-zeros the mapped source column holds
+    std::vector<int> columnElementCounts(GetNumCols(), 0);
+    size_t numColsToWrite = idx.GetNumCols();
+    for (size_t j = 0; j < numColsToWrite; j++)
+    {
+        auto jOutF = idx(0, j); // this is the column we need to write to
+        if (::isnan(jOutF) || (jOutF < 0))     // negative index means gap
+            continue;
+        size_t jOut = (size_t)jOutF;
+        columnElementCounts[jOut] = a.SecondaryIndexLocation()[j + 1] - a.SecondaryIndexLocation()[j];
+    }
+
+    // TODO: Replace with std::exclusive_scan when we switch to C++17
+    for (size_t i = 1; i <= GetNumCols(); ++i)
+        SecondaryIndexLocation()[i] = SecondaryIndexLocation()[i - 1] + columnElementCounts[i - 1];
+
+    // TODO: Does it make sense to parallelize this?
+    for (size_t j = 0; j < numColsToWrite; j++)
+    {
+        auto jOutF = idx(0, j); // this is the column we need to write to
+        if (::isnan(jOutF) || (jOutF < 0))     // negative index means gap
+            continue;
+        size_t jOut = (size_t)jOutF;
+
+        // Read column j of 'a' from its own start; a running offset desynchronizes once a non-empty gap column is skipped.
+        size_t offset = a.SecondaryIndexLocation()[j];
+        auto start = SecondaryIndexLocation()[jOut];
+        auto end = SecondaryIndexLocation()[jOut + 1];
+        for (auto p = start; p < end; p++, offset++)
+        {
+            GetUnCompIndex()[p] = a.GetUnCompIndex()[offset];
+            Buffer()[p] = a.Buffer()[offset] * alpha;
+        }
+    }
+
+    return *this;
+}
+
 template <class ElemType>
 void CPUSparseMatrix<ElemType>::Print(const char* matrixName) const
 {
@ -587,13 +645,7 @@ void CPUSparseMatrix<ElemType>::SetMatrixFromCSCFormat(const CPUSPARSE_INDEX_TYP
 }

 template <class ElemType>
-ElemType* CPUSparseMatrix<ElemType>::Data() const
-{
-    return Buffer() + GetCompIndex()[m_sliceViewOffset];
-}
-
-template <class ElemType>
-ElemType* CPUSparseMatrix<ElemType>::Data() 
+ElemType* CPUSparseMatrix<ElemType>::Data()  const
 {
    return (Buffer() + 
        ((GetFormat() == matrixFormatSparseCSC || GetFormat() == matrixFormatSparseCSR) ? GetCompIndex()[m_sliceViewOffset] : 0));
@ -1340,20 +1392,12 @@ ElemType CPUSparseMatrix<ElemType>::SumOfAbsElements() const

    if (sizeof(ElemType) == sizeof(double))
    {
-#ifdef USE_ACML
-        return (ElemType) dasum((int) this->NzCount(), reinterpret_cast<double*>(Data()), 1);
-#else
        return (ElemType) cblas_dasum((int) this->NzCount(), reinterpret_cast<double*>(Data()), 1);
-#endif
    }
    else
    {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-        return sasum((int) this->NzCount(), reinterpret_cast<float*>(Data()), 1);
-#else
        return cblas_sasum((int) this->NzCount(), reinterpret_cast<float*>(Data()), 1);
-#endif
    }
 }

@ -1495,7 +1539,6 @@ template void CPUSparseMatrix<char>::SetValue(size_t, size_t, char);
 template void CPUSparseMatrix<char>::SetValue(CPUSparseMatrix<char> const&);
 //template void CPUSparseMatrix<char>::SetValue(GPUSparseMatrix<char> const&);
 template char* CPUSparseMatrix<char>::Data() const;
-template char* CPUSparseMatrix<char>::Data();
 template void CPUSparseMatrix<char>::Reset(void);
 template void CPUSparseMatrix<char>::Resize(const size_t, const size_t, const size_t, const bool);
 template void CPUSparseMatrix<char>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const bool, bool);
@ -1518,7 +1561,6 @@ template void CPUSparseMatrix<short>::SetValue(size_t, size_t, short);
 template void CPUSparseMatrix<short>::SetValue(CPUSparseMatrix<short> const&);
 //template void CPUSparseMatrix<short>::SetValue(GPUSparseMatrix<short> const&);
 template short* CPUSparseMatrix<short>::Data() const;
-template short* CPUSparseMatrix<short>::Data();
 template void CPUSparseMatrix<short>::Reset(void);
 template void CPUSparseMatrix<short>::Resize(const size_t, const size_t, const size_t, const bool);
 template void CPUSparseMatrix<short>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const bool, bool);
--- a/Source/Math/CPUSparseMatrix.h
+++ b/Source/Math/CPUSparseMatrix.h
@ -92,13 +92,13 @@ public:
    void MaskColumnsValue(const CPUMatrix<char>& columnsMask, ElemType val);

    CPUSparseMatrix<ElemType>& DoGatherColumnsOf(ElemType beta, const CPUMatrix<ElemType>& idx, const CPUSparseMatrix<ElemType>& a, ElemType alpha);
+    CPUSparseMatrix<ElemType>& DoScatterColumnsOf(ElemType beta, const CPUMatrix<ElemType>& idx, const CPUSparseMatrix<ElemType>& a, ElemType alpha);

    size_t BufferSize() const
    {
        return GetSizeAllocated() * sizeof(ElemType);
    }
    ElemType* Data() const;
-    ElemType* Data();
    inline size_t GetNumElemAllocated() const
    {
        return GetSizeAllocated();
@ -262,7 +262,8 @@ public:

    CPUSPARSE_INDEX_TYPE* MajorIndexLocation() const
    {
-        return GetUnCompIndex() + GetCompIndex()[m_sliceViewOffset];
+        return (GetUnCompIndex() + 
+            ((GetFormat() == matrixFormatSparseCSC || GetFormat() == matrixFormatSparseCSR) ? GetCompIndex()[m_sliceViewOffset] : 0));
    } // this is the major index, row/col ids in CSC/CSR format

    size_t MajorIndexCount() const
--- a/Source/Math/GPUMatrix.cu
+++ b/Source/Math/GPUMatrix.cu
@ -237,7 +237,7 @@ std::pair<size_t, size_t> TracingGPUMemoryAllocator::GetFreeAndTotalMemoryInMBs(
 // deviceId - the device on which the operation will take place
 void PrepareDevice(DEVICEID_TYPE deviceId)
 {
-    static DEVICEID_TYPE currentDevice = DEVICEID_NOTYETDETERMINED;
+    THREAD_LOCAL static DEVICEID_TYPE currentDevice = DEVICEID_NOTYETDETERMINED;
    // and if we last set the device to be this device we are good
    if (deviceId == currentDevice)
        return;
--- a/Source/Math/Math.vcxproj
+++ b/Source/Math/Math.vcxproj
@ -227,6 +227,5 @@
  <Target Name="CheckDependencies">
    <Error Condition="'$(MathLibrary)' == 'MKL' And '$(CNTK_MKL_PATH)' == ''" Text="CNTK custom MKL location not specified, see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#optional-mkl for instructions." />
    <Error Condition="'$(MathLibrary)' == 'MKL' And !Exists('$(CNTKCustomMKLPath)')" Text="CNTK custom MKL not found. See https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#optional-mkl for instructions." />
-    <Error Condition="'$(MathLibrary)' == 'ACML' And !Exists('$(ACML_PATH)')" Text="ACML not found. See https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#acml for instructions." />
  </Target>
 </Project>
--- a/Source/Math/Matrix.cpp
+++ b/Source/Math/Matrix.cpp
@ -1081,7 +1081,7 @@ Matrix<ElemType>& Matrix<ElemType>::DoScatterColumnsOf(ElemType beta, const Matr
    DISPATCH_MATRIX_ON_FLAG(&a, this,
        { m_CPUMatrix->DoScatterColumnsOf(beta, *idx.m_CPUMatrix, *a.m_CPUMatrix, alpha); },
        { m_GPUMatrix->DoScatterColumnsOf(beta, *idx.m_GPUMatrix, *a.m_GPUMatrix, alpha); },
-        { NOT_IMPLEMENTED; },
+        { m_CPUSparseMatrix->DoScatterColumnsOf(beta, *idx.m_CPUMatrix, *a.m_CPUSparseMatrix, alpha); },
        { NOT_IMPLEMENTED; });

    return *this;
--- a/Source/Readers/HTKMLFReader/HTKMLFReader.cpp
+++ b/Source/Readers/HTKMLFReader/HTKMLFReader.cpp
@ -377,8 +377,8 @@ void HTKMLFReader<ElemType>::PrepareForTrainingOrTesting(const ConfigRecordType&

            // second, remove trailing slash if there is any
            // TODO: when gcc -v is 4.9 or greater, this should be: std::regex_replace(rootpath, L"\\/+$", wstring());
-            size_t stringPos = 0;
-            for (stringPos = rootpath.length() - 1; stringPos >= 0; stringPos--) 
+            int stringPos = 0;
+            for (stringPos = (int) (rootpath.length() - 1); stringPos >= 0; stringPos--) 
            {
                if (rootpath[stringPos] != L'/')
                {
@ -517,11 +517,11 @@ void HTKMLFReader<ElemType>::PrepareForTrainingOrTesting(const ConfigRecordType&
        m_lattices->setverbosity(m_verbosity);

        // now get the frame source. This has better randomization and doesn't create temp files
-        bool minimizeReaderMemoryFootprint = readerConfig(L"minimizeReaderMemoryFootprint", true);
-        m_frameSource.reset(new msra::dbn::minibatchutterancesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, 
+        bool useMersenneTwisterRand = readerConfig(L"useMersenneTwisterRand", false);
+        m_frameSource.reset(new msra::dbn::minibatchutterancesourcemulti(useMersenneTwisterRand, infilesmulti, labelsmulti, m_featDims, m_labelDims,
                                                                         numContextLeft, numContextRight, randomize, 
                                                                         *m_lattices, m_latticeMap, m_frameMode, 
-                                                                         minimizeReaderMemoryFootprint, m_expandToUtt));
+                                                                         m_expandToUtt));
        m_frameSource->setverbosity(m_verbosity);
    }
    else if (EqualCI(readMethod, L"rollingWindow"))
--- a/Source/Readers/HTKMLFReader/utterancesourcemulti.h
+++ b/Source/Readers/HTKMLFReader/utterancesourcemulti.h
@ -12,7 +12,8 @@
 #include "latticearchive.h" // for reading HTK phoneme lattices (MMI training)
 #include "minibatchsourcehelpers.h"
 #include "minibatchiterator.h"
-#include "unordered_set"
+#include <unordered_set>
+#include <random>

 namespace msra { namespace dbn {

@ -38,6 +39,10 @@ class minibatchutterancesourcemulti : public minibatchsource
    // const std::vector<unique_ptr<latticesource>> &lattices;
    const latticesource &lattices;

+    // Flag indicating whether to use Mersenne Twister random generator.
+    bool m_useMersenneTwister;
+    std::mt19937_64 m_rng;
+
    // std::vector<latticesource> lattices;
    // word-level transcripts (for MMI mode when adding best path to lattices)
    const map<wstring, msra::lattices::lattice::htkmlfwordsequence> &allwordtranscripts; // (used for getting word-level transcripts)
@ -413,6 +418,7 @@ class minibatchutterancesourcemulti : public minibatchsource
        // When true we use a rolling window of randomized framerefs to minimize memory
        // footprint, instead of using a large vector listing all frames in the training corpus
        // Functionally, the 2 methods are identical.
+        // When it is true, we also use Mersenne Twister random generator for randomization.
        const bool m_minimizeMemoryFootprint;

        // [globalt-sweepts] -> (chunk, utt, frame) lookup table for randomized frames  --this can be REALLY big!
@ -429,6 +435,10 @@ class minibatchutterancesourcemulti : public minibatchsource
        size_t m_currentRangeEndChunkIdx;
        size_t m_nextFramePosNotYetRandomized;

+        // If m_minimizeMemoryFootprint is true, Mersenne Twister is used for randomization
+        // because rand has problems in distributed case.
+        std::mt19937_64 m_rng;
+
    public:
        framerandomizer(const std::vector<std::vector<chunk>>& randomizedChunks, bool minimizeMemoryFootprint)
            : m_randomizedChunks(randomizedChunks), m_minimizeMemoryFootprint(minimizeMemoryFootprint), m_currentRangeBeginChunkIdx(0), m_currentRangeEndChunkIdx(0), m_nextFramePosNotYetRandomized(0)
@ -496,7 +506,9 @@ class minibatchutterancesourcemulti : public minibatchsource

                for (;;) // (randomization retry loop)
                {
-                    size_t tswap = Microsoft::MSR::CNTK::rand(postbegin, postend); // random frame position within allowed range
+                    size_t tswap = m_minimizeMemoryFootprint ?
+                        Microsoft::MSR::CNTK::RandMT(postbegin, postend, m_rng) :
+                        Microsoft::MSR::CNTK::rand(postbegin, postend); // random frame position within allowed range
                    // We want to swap 't' to 'tswap' and 'tswap' to 't'.
                    //  - Both may have been swapped before.
                    //  - Both must stay within the randomization window of their respective position.
@ -542,11 +554,11 @@ class minibatchutterancesourcemulti : public minibatchsource

        void reset(unsigned int randSeed)
        {
-            srand(randSeed);
            size_t sweepts = m_randomizedChunks[0][0].globalts;
            size_t totalFrames = m_randomizedChunks[0].back().globalte() - sweepts;
            if (m_minimizeMemoryFootprint)
            {
+                m_rng.seed(randSeed);
                m_randomizedframerefsWindow.clear();
                m_currentRangeBeginChunkIdx = m_randomizedChunks[0][0].windowbegin;
                m_currentRangeEndChunkIdx = m_currentRangeBeginChunkIdx;
@ -554,6 +566,7 @@ class minibatchutterancesourcemulti : public minibatchsource
            }
            else
            {
+                srand(randSeed + 1);
                if (m_randomizedframerefs.size() != totalFrames)
                    m_randomizedframerefs.resize(totalFrames);

@ -866,10 +879,11 @@ public:
    // constructor
    // Pass empty labels to denote unsupervised training (so getbatch() will not return uids).
    // This mode requires utterances with time stamps.
-    minibatchutterancesourcemulti(const std::vector<std::vector<wstring>> &infiles, const std::vector<map<wstring, std::vector<msra::asr::htkmlfentry>>> &labels,
+    minibatchutterancesourcemulti(bool useMersenneTwister, const std::vector<std::vector<wstring>> &infiles, const std::vector<map<wstring, std::vector<msra::asr::htkmlfentry>>> &labels,
                                  std::vector<size_t> vdim, std::vector<size_t> udim, std::vector<size_t> leftcontext, std::vector<size_t> rightcontext, size_t randomizationrange,
-                                  const latticesource &lattices, const map<wstring, msra::lattices::lattice::htkmlfwordsequence> &allwordtranscripts, const bool framemode, bool minimizeMemoryFootprint, std::vector<bool> expandToUtt)
-                                  : vdim(vdim), leftcontext(leftcontext), rightcontext(rightcontext), sampperiod(0), featdim(0), randomizationrange(randomizationrange), currentsweep(SIZE_MAX), lattices(lattices), allwordtranscripts(allwordtranscripts), framemode(framemode), chunksinram(0), timegetbatch(0), verbosity(2), m_generatePhoneBoundaries(!lattices.empty()), m_frameRandomizer(randomizedchunks, minimizeMemoryFootprint), expandToUtt(expandToUtt)
+                                  const latticesource &lattices, const map<wstring, msra::lattices::lattice::htkmlfwordsequence> &allwordtranscripts, const bool framemode, std::vector<bool> expandToUtt)
+                                  : vdim(vdim), leftcontext(leftcontext), rightcontext(rightcontext), sampperiod(0), featdim(0), randomizationrange(randomizationrange), currentsweep(SIZE_MAX), lattices(lattices), allwordtranscripts(allwordtranscripts), framemode(framemode), chunksinram(0), timegetbatch(0), verbosity(2), m_generatePhoneBoundaries(!lattices.empty()), m_frameRandomizer(randomizedchunks, useMersenneTwister), expandToUtt(expandToUtt),
+                                    m_useMersenneTwister(useMersenneTwister)
    // [v-hansu] change framemode (lattices.empty()) into framemode (false) to run utterance mode without lattice
    // you also need to change another line, search : [v-hansu] comment out to run utterance mode without lattice
    {
@ -1251,8 +1265,16 @@ private:
                randomizedchunkrefs[i].push_back(allchunks[i].begin() + j);
            assert(randomizedchunkrefs[i].size() == allchunks[i].size());

-            // note that sincew randomshuffle() uses sweep as seed, this will keep the randomization common across all feature streams
-            randomshuffle(randomizedchunkrefs[i], sweep); // bring into random order (with random seed depending on sweep)
+            if (m_useMersenneTwister)
+            {
+                m_rng.seed((unsigned long)sweep);
+                Microsoft::MSR::CNTK::RandomShuffleMT(randomizedchunkrefs[i], m_rng); // bring into random order (with random seed depending on sweep)
+            }
+            else
+            {
+                // note that since randomshuffle() uses sweep as seed, this will keep the randomization common across all feature streams
+                randomshuffle(randomizedchunkrefs[i], sweep); // bring into random order (with random seed depending on sweep)
+            }
        }

        // place them onto the global timeline -> randomizedchunks[]
@ -1348,7 +1370,7 @@ private:
            // check we got those setup right

            // we now randomly shuffle randomizedutterancerefs[pos], while considering the constraints of what chunk range needs to be in memory
-            srand((unsigned int) sweep + 1);
+            m_useMersenneTwister ? m_rng.seed((unsigned long)sweep) : srand((unsigned int)sweep + 1);
            for (size_t i = 0; i < randomizedutterancerefs.size(); i++)
            {
                // get valid randomization range, expressed in chunks
@ -1364,7 +1386,9 @@ private:
                for (;;)
                {
                    // pick a random location
-                    const size_t j = Microsoft::MSR::CNTK::rand(posbegin, posend); // a random number within the window
+                    const size_t j = m_useMersenneTwister ?
+                        Microsoft::MSR::CNTK::RandMT(posbegin, posend, m_rng) :
+                        Microsoft::MSR::CNTK::rand(posbegin, posend); // a random number within the window
                    if (i == j)
                        break; // the random gods say "this one points to its original position"... nothing wrong about that, but better not try to swap

@ -1416,7 +1440,7 @@ private:
        }
        else // frame mode
        {
-            m_frameRandomizer.reset((unsigned int)sweep + 1);
+            m_frameRandomizer.reset((unsigned int)sweep);
        }

        return sweep;
--- a/Source/Readers/ImageReader/ByteReader.h
+++ b/Source/Readers/ImageReader/ByteReader.h
@ -21,7 +21,7 @@ public:
    ByteReader() = default;
    virtual ~ByteReader() = default;

-    virtual void Register(size_t seqId, const std::string& path) = 0;
+    virtual void Register(const std::map<std::string, size_t>& sequences) = 0;
    virtual cv::Mat Read(size_t seqId, const std::string& path, bool grayscale) = 0;

    DISABLE_COPY_AND_MOVE(ByteReader);
@ -30,7 +30,7 @@ public:
 class FileByteReader : public ByteReader
 {
 public:
-    void Register(size_t, const std::string&) override {}
+    void Register(const std::map<std::string, size_t>&) override {}
    cv::Mat Read(size_t seqId, const std::string& path, bool grayscale) override;
 };

@ -40,7 +40,7 @@ class ZipByteReader : public ByteReader
 public:
    ZipByteReader(const std::string& zipPath);

-    void Register(size_t seqId, const std::string& path) override;
+    void Register(const std::map<std::string, size_t>& sequences) override;
    cv::Mat Read(size_t seqId, const std::string& path, bool grayscale) override;

 private:
--- a/Source/Readers/ImageReader/ImageDataDeserializer.cpp
+++ b/Source/Readers/ImageReader/ImageDataDeserializer.cpp
@ -13,6 +13,7 @@
 #include "ImageConfigHelper.h"
 #include "StringUtil.h"
 #include "ConfigUtil.h"
+#include "TimerUtility.h"

 namespace Microsoft { namespace MSR { namespace CNTK {

@ -135,6 +136,7 @@ ImageDataDeserializer::ImageDataDeserializer(CorpusDescriptorPtr corpus, const C
    }

    string precision = (ConfigValue)config("precision", "float");
+    m_verbosity = config(L"verbosity", 0);

    // Feature stream.
    ConfigParameters featureSection = inputs(featureNames[0]);
@ -144,6 +146,7 @@ ImageDataDeserializer::ImageDataDeserializer(CorpusDescriptorPtr corpus, const C
    features->m_storageType = StorageType::dense;
    features->m_elementType = AreEqualIgnoreCase(precision, "float") ? ElementType::tfloat : ElementType::tdouble;
    m_streams.push_back(features);
+    m_featureElementType = features->m_elementType;

    // Label stream.
    ConfigParameters label = inputs(labelNames[0]);
@ -179,6 +182,8 @@ ImageDataDeserializer::ImageDataDeserializer(const ConfigParameters& config)
    const auto& label = m_streams[configHelper.GetLabelStreamId()];
    const auto& feature = m_streams[configHelper.GetFeatureStreamId()];

+    m_verbosity = config(L"verbosity", 0);
+
    // Expect data in HWC.
    ImageDimensions dimensions(*feature->m_sampleLayout, configHelper.GetDataFormat());
    feature->m_sampleLayout = std::make_shared<TensorShape>(dimensions.AsTensorShape(HWC));
@ -240,9 +245,13 @@ void ImageDataDeserializer::CreateSequenceDescriptions(CorpusDescriptorPtr corpu
    size_t curId = 0;
    std::string line;
    PathReaderMap knownReaders;
+    ReaderSequenceMap readerSequences;
    ImageSequenceDescription description;
    description.m_numberOfSamples = 1;

+    Timer timer;
+    timer.Start();
+
    auto& stringRegistry = corpus->GetStringRegistry();
    for (size_t lineIndex = 0; std::getline(mapFile, line); ++lineIndex)
    {
@ -296,9 +305,20 @@ void ImageDataDeserializer::CreateSequenceDescriptions(CorpusDescriptorPtr corpu

            m_keyToSequence[description.m_key.m_sequence] = m_imageSequences.size();
            m_imageSequences.push_back(description);
-            RegisterByteReader(description.m_id, description.m_path, knownReaders);
+            RegisterByteReader(description.m_id, description.m_path, knownReaders, readerSequences);
        }
    }
+
+    for (auto& reader : knownReaders)
+    {
+        reader.second->Register(readerSequences[reader.first]);
+    }
+
+    timer.Stop();
+    if (m_verbosity > 1)
+    {
+        fprintf(stderr, "ImageDeserializer: Read information about %d images in %.6g seconds\n", (int)m_imageSequences.size(), timer.ElapsedSeconds());
+    }
 }

 ChunkPtr ImageDataDeserializer::GetChunk(ChunkIdType chunkId)
@ -307,7 +327,7 @@ ChunkPtr ImageDataDeserializer::GetChunk(ChunkIdType chunkId)
    return std::make_shared<ImageChunk>(sequenceDescription, *this);
 }

-void ImageDataDeserializer::RegisterByteReader(size_t seqId, const std::string& path, PathReaderMap& knownReaders)
+void ImageDataDeserializer::RegisterByteReader(size_t seqId, const std::string& path, PathReaderMap& knownReaders, ReaderSequenceMap& readerSequences)
 {
    assert(!path.empty());

@ -330,16 +350,19 @@ void ImageDataDeserializer::RegisterByteReader(size_t seqId, const std::string&
    {
        reader = std::make_shared<ZipByteReader>(containerPath);
        knownReaders[containerPath] = reader;
+        readerSequences[containerPath] = std::map<std::string, size_t>();
    }
    else
    {
        reader = (*r).second;
    }
-    reader->Register(seqId, itemPath);
+
+    readerSequences[containerPath][itemPath] = seqId;
    m_readers[seqId] = reader;
 #else
    UNUSED(seqId);
    UNUSED(knownReaders);
+    UNUSED(readerSequences);
    RuntimeError("The code is built without zip container support. Only plain image files are supported.");
 #endif
 }
--- a/Source/Readers/ImageReader/ImageDataDeserializer.h
+++ b/Source/Readers/ImageReader/ImageDataDeserializer.h
@ -72,7 +72,8 @@ private:

    // Not using nocase_compare here as it's not correct on Linux.
    using PathReaderMap = std::unordered_map<std::string, std::shared_ptr<ByteReader>>;
-    void RegisterByteReader(size_t seqId, const std::string& path, PathReaderMap& knownReaders);
+    using ReaderSequenceMap = std::map<std::string, std::map<std::string, size_t>>;
+    void RegisterByteReader(size_t seqId, const std::string& path, PathReaderMap& knownReaders, ReaderSequenceMap& readerSequences);
    cv::Mat ReadImage(size_t seqId, const std::string& path, bool grayscale);

    // REVIEW alexeyk: can potentially use vector instead of map. Need to handle default reader and resizing though.
@ -80,6 +81,7 @@ private:
    SeqReaderMap m_readers;

    FileByteReader m_defaultReader;
+    int m_verbosity;
 };

 }}}
--- a/Source/Readers/ImageReader/ZipByteReader.cpp
+++ b/Source/Readers/ImageReader/ZipByteReader.cpp
@ -44,16 +44,46 @@ ZipByteReader::ZipPtr ZipByteReader::OpenZip()
    });
 }

-void ZipByteReader::Register(size_t seqId, const std::string& path)
+// Resolves each requested entry name (key) to its archive index/size, stored under its sequence id (value).
+void ZipByteReader::Register(const std::map<std::string, size_t>& sequences)
 {
    auto zipFile = m_zips.pop_or_create([this]() { return OpenZip(); });
    zip_stat_t stat;
    zip_stat_init(&stat);
-    int err = zip_stat(zipFile.get(), path.c_str(), 0, &stat);
-    if (ZIP_ER_OK != err)
-        RuntimeError("Failed to get file info of %s, zip library error: %s", path.c_str(), GetZipError(err).c_str());
-    m_seqIdToIndex[seqId] = std::make_pair(stat.index, stat.size);
+
+    // Signed count: a -1 failure result skips the loop, so all sequences get reported as missing below.
+    const zip_int64_t numEntries = zip_get_num_entries(zipFile.get(), 0);
+    for (zip_int64_t i = 0; i < numEntries; ++i)
+    {
+        // zip_stat_index() only signals 0 / -1; the actual error code is stored in the archive.
+        if (zip_stat_index(zipFile.get(), (zip_uint64_t)i, 0, &stat) != 0)
+            RuntimeError("Failed to get file info for index %d, zip library error: %s", (int)i,
+                         GetZipError(zip_error_code_zip(zip_get_error(zipFile.get()))).c_str());
+
+        // An entry whose name was not filled in cannot be matched against the requested paths.
+        if ((stat.valid & ZIP_STAT_NAME) == 0)
+            continue;
+
+        auto sequenceId = sequences.find(std::string(stat.name));
+        if (sequenceId != sequences.end())
+            m_seqIdToIndex[sequenceId->second] = std::make_pair(stat.index, stat.size);
+    }
    m_zips.push(std::move(zipFile));
+
+    // Verify against the recorded map instead of a hit counter: duplicate archive entries
+    // would inflate a counter and either mask a missing sequence or trigger a spurious failure.
+    size_t numMissing = 0;
+    for (const auto& s : sequences)
+    {
+        if (m_seqIdToIndex.find(s.second) == m_seqIdToIndex.end())
+        {
+            fprintf(stderr, "Sequence %s is not found in container %s.\n", s.first.c_str(), m_zipPath.c_str());
+            numMissing++;
+        }
+    }
+
+    if (numMissing != 0)
+        RuntimeError("Cannot retrieve image data for some sequences. For more detail, please see the log file.");
 }

 cv::Mat ZipByteReader::Read(size_t seqId, const std::string& path, bool grayscale)
--- a/Source/Readers/KaldiReaderReadme
+++ b/Source/Readers/KaldiReaderReadme
@ -22,17 +22,10 @@
 == Preeliminaries == 

 To build the cpu version, you have to install intel MKL blas library
-or ACML library first. Note that ACML is free, whereas MKL may not be.

 for MKL:
 1. Download from https://software.intel.com/en-us/intel-mkl

-for ACML:
-1. Download from
-http://developer.amd.com/tools-and-sdks/archive/amd-core-math-library-acml/acml-downloads-resources/
-We have seen some problems with some versions of the library on Intel
-processors, but have had success with acml-5-3-1-ifort-64bit.tgz
-
 for Kaldi:
 1. In kaldi-trunk/tools/Makefile, uncomment # OPENFST_VERSION = 1.4.1, and
   re-install OpenFst using the makefile.
@ -54,8 +47,7 @@ build in the directory "build" type
 (For an in source build, just run configure in the $CNTK directory).

 You will see various options for configure, as well as their default
-values.  CNTK needs a CPU math directory, either acml or mkl.  If you
-do not specify one and both are available, acml will be used.  For GPU
+values.  CNTK needs a CPU math library (mkl). For GPU
 use, a cuda and gdk directory are also required.  Similary, to build
 the kaldi plugin a kaldi directory is required.  You may also specify
 whether you want a debug or release build, as well as add additional
--- a/Source/Readers/LMSequenceReader/SequenceParser.h
+++ b/Source/Readers/LMSequenceReader/SequenceParser.h
@ -171,6 +171,12 @@ public:
    // setup all the state variables and state tables for state machine
    void Init();

+    // convenience function: computes (does not store) the flags from m_beginSequence / m_endSequence
+    inline unsigned int SetSequenceFlags() const
+    {
+        return (m_beginSequence ? seqFlagStartLabel : 0) | (m_endSequence ? seqFlagStopLabel : 0) | seqFlagLineBreak;
+    }
+
    // Parser destructor
    ~SequenceParser();

@ -334,8 +340,7 @@ public:
                case EndOfLine:
                    if (seqPos)
                    {
-                        SequencePosition sequencePos(numbers->size(), labels->size(),
-                                                     (m_beginSequence ? seqFlagStartLabel : 0) | (m_endSequence ? seqFlagStopLabel : 0) | seqFlagLineBreak);
+                        SequencePosition sequencePos(numbers->size(), labels->size(), SetSequenceFlags());
                        // add a sequence element to the list
                        seqPos->push_back(sequencePos);
                        sequencePositionLast = sequencePos;
@ -429,8 +434,7 @@ public:
        // this could probably be fixed by taking another pass through the loop above, but this is easier
        if (seqPos)
        {
-            SequencePosition sequencePos(numbers->size(), labels->size(),
-                                         m_beginSequence ? seqFlagStartLabel : 0 | m_endSequence ? seqFlagStopLabel : 0 | seqFlagLineBreak);
+            SequencePosition sequencePos(numbers->size(), labels->size(), SetSequenceFlags());
            // add the final sequence element if needed
            if (!(sequencePos.labelPos == sequencePositionLast.labelPos && sequencePos.numberPos == sequencePositionLast.numberPos))
            {
@ -510,6 +514,7 @@ public:
    using SequenceParser<NumType, LabelType>::m_totalNumbersConverted;
    using SequenceParser<NumType, LabelType>::m_dimLabelsOut;
    using SequenceParser<NumType, LabelType>::m_bufferStart;
+    using SequenceParser<NumType, LabelType>::SetSequenceFlags;
    LMSequenceParser()
    {
        mFile = nullptr;
@ -594,8 +599,7 @@ public:
                labels->push_back(std::move(vstr[i])); // TODO: is this an entire sequence, or multiple columns describing a single token?

            // add a sequence element to the list
-            SequencePosition sequencePos(numbers->size(), labels->size(),
-                                         m_beginSequence ? seqFlagStartLabel : 0 | m_endSequence ? seqFlagStopLabel : 0 | seqFlagLineBreak);
+            SequencePosition sequencePos(numbers->size(), labels->size(), SetSequenceFlags());
            seqPos->push_back(sequencePos);

            lineCount++;
--- a/Source/Readers/ReaderLib/BlockRandomizer.cpp
+++ b/Source/Readers/ReaderLib/BlockRandomizer.cpp
@ -80,8 +80,8 @@ void BlockRandomizer::StartEpoch(const EpochConfiguration& config)

 #ifdef _DEBUG
    size_t epochStartFrame = config.m_epochIndex * m_epochSize;
-    fprintf(stderr, "BlockRandomizer::StartEpoch: epoch %" PRIu64 ": frames [%" PRIu64 "..%" PRIu64 "] (first sequence at sample %" PRIu64 "), data subset %" PRIu64 " of %" PRIu64 "\n",
-            config.m_epochIndex,
+    fprintf(stderr, "BlockRandomizer::StartEpoch: epoch %" PRIu64 ": samples [%" PRIu64 "..%" PRIu64 "] (first sequence at sample %" PRIu64 "), worker rank %" PRIu64 ", total workers %" PRIu64 "\n",
+            config.m_epochIndex + 1,
            epochStartFrame,
            epochStartFrame + m_epochSize,
            m_globalSamplePosition,
@ -107,7 +107,7 @@ void BlockRandomizer::PrepareNewSweepIfNeeded(size_t samplePosition)
        m_chunkRandomizer->Randomize((unsigned int)m_sweep);

        // Resetting sequence randomizer.
-        m_sequenceRandomizer->Reset(m_sweep + 1);
+        m_sequenceRandomizer->Reset(m_sweep);
        m_lastSeenChunkId = CHUNKID_MAX;
    }
 }
@ -138,8 +138,8 @@ Sequences BlockRandomizer::GetNextSequences(size_t sampleCount)

    if (m_verbosity >= Debug)
        fprintf(stderr, "BlockRandomizer::GetNextSequences(): getting %" PRIu64 " out of %" PRIu64 " sequences for %" PRIu64 " requested samples in sweep %" PRIu64 "\n",
-            sequences.size(),
            decimated.size(),
+            sequences.size(),
            sampleCount,
            m_sweep);

--- a/Source/Readers/ReaderLib/ChunkRandomizer.cpp
+++ b/Source/Readers/ReaderLib/ChunkRandomizer.cpp
@ -10,25 +10,6 @@

 namespace Microsoft { namespace MSR { namespace CNTK {

-    // NOTE: This is an old code, used for legacy randomization to make sure we preserve the same behavior for the tests.
-    // TODO: Deprecate when the new randomizer is in place.
-    template <typename TVector>
-    void RandomShuffle(TVector& v, size_t randomSeed)
-    {
-        if (v.size() > RAND_MAX * static_cast<size_t>(RAND_MAX))
-        {
-            RuntimeError("RandomShuffle: too large set: need to change to different random generator!");
-        }
-
-        srand(static_cast<unsigned int>(randomSeed));
-        foreach_index(currentLocation, v)
-        {
-            // Pick a random location a location and swap with current
-            const size_t randomLocation = rand(0, v.size());
-            std::swap(v[currentLocation], v[randomLocation]);
-        }
-    }
-
    ChunkRandomizer::ChunkRandomizer(IDataDeserializerPtr deserializer, size_t randomizationRangeInSamples, bool legacy) :
        m_deserializer(deserializer), m_legacy(legacy), m_randomizationRangeInSamples(randomizationRangeInSamples)
    {
@ -52,15 +33,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            randomizedChunkIndices.push_back(i);
        }

-        if (m_legacy)
-        {
-            RandomShuffle(randomizedChunkIndices, seed);
-        }
-        else
-        {
-            std::mt19937 m_rng(static_cast<int>(seed));
-            std::shuffle(randomizedChunkIndices.begin(), randomizedChunkIndices.end(), m_rng);
-        }
+        m_rng.seed(seed);
+        RandomShuffleMT(randomizedChunkIndices, m_rng);

        // Place randomized chunks on the timeline
        m_randomizedChunks.clear();
--- a/Source/Readers/ReaderLib/ChunkRandomizer.h
+++ b/Source/Readers/ReaderLib/ChunkRandomizer.h
@ -7,6 +7,7 @@

 #include <vector>
 #include "DataDeserializer.h"
+#include <random>

 namespace Microsoft { namespace MSR { namespace CNTK {

@ -68,6 +69,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        bool m_legacy;
        // Randomization range in samples.
        size_t m_randomizationRangeInSamples;
+
+        std::mt19937_64 m_rng;
    };

    typedef std::shared_ptr<ChunkRandomizer> ChunkRandomizerPtr;
--- a/Source/Readers/ReaderLib/SequenceRandomizer.cpp
+++ b/Source/Readers/ReaderLib/SequenceRandomizer.cpp
@ -45,7 +45,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    // Resets the current sweep according to the randomization seed provided.
    void SequenceRandomizer::Reset(size_t randSeed)
    {
-        srand((unsigned int)randSeed);
+        m_rng.seed((unsigned long)randSeed);

        m_sequenceWindow.clear();
        m_chunkWindow.clear();
@ -197,7 +197,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            for (;;)
            {
                // Pick a sequence position from [posBegin, posEnd)
-                const size_t j = rand(posBegin, posEnd);
+                const size_t j = RandMT(posBegin, posEnd, m_rng);

                // Pick up j sequence.
                ChunkIdType jChunkIndex = GetChunkIndexForSequencePosition(j);
--- a/Source/Readers/ReaderLib/SequenceRandomizer.h
+++ b/Source/Readers/ReaderLib/SequenceRandomizer.h
@ -11,6 +11,7 @@
 #include "DataDeserializer.h"
 #include "ChunkRandomizer.h"
 #include <deque>
+#include <random>

 namespace Microsoft { namespace MSR { namespace CNTK {

@ -164,6 +165,8 @@ private:

    // General configuration
    int m_verbosity;
+
+    std::mt19937_64 m_rng;
 };

 typedef std::shared_ptr<SequenceRandomizer> SequenceRandomizerPtr;
--- a/Source/SGDLib/SGD.cpp
+++ b/Source/SGDLib/SGD.cpp
@ -40,30 +40,10 @@ template SGD<double>::SGD(const ScriptableObjects::IConfigRecord&);
 // -----------------------------------------------------------------------

 template <class ElemType>
-void SGD<ElemType>::Train(function<ComputationNetworkPtr(DEVICEID_TYPE)> createNetworkFn, DEVICEID_TYPE deviceId,
+void SGD<ElemType>::Train(shared_ptr<ComputationNetwork> net, DEVICEID_TYPE deviceId,
                          IDataReader* trainSetDataReader,
-                          IDataReader* validationSetDataReader,
-                          const bool makeMode)
+                          IDataReader* validationSetDataReader, int startEpoch, bool loadNetworkFromCheckpoint)
 {
-    // determine which epoch to start with, including recovering a checkpoint if any and 'makeMode' enabled
-    int startEpoch = DetermineStartEpoch(makeMode);
-    if (startEpoch == m_maxEpochs)
-    {
-        LOGPRINTF(stderr, "No further training is necessary.\n");
-        return;
-    }
-
-    wstring modelFileName = GetModelNameForEpoch(int(startEpoch) - 1);
-    bool loadNetworkFromCheckpoint = startEpoch >= 0;
-    fprintf(stderr, "\n");
-    if (loadNetworkFromCheckpoint)
-        LOGPRINTF(stderr, "Starting from checkpoint. Loading network from '%ls'.\n", modelFileName.c_str());
-    else
-        LOGPRINTF(stderr, "Creating virgin network.\n");
-
-    // create or load from checkpoint
-    shared_ptr<ComputationNetwork> net = !loadNetworkFromCheckpoint ? createNetworkFn(deviceId) : ComputationNetwork::CreateFromFile<ElemType>(deviceId, modelFileName);
-
    // log the device we are computing on
    LOGPRINTF(stderr, "%s model with %d nodes", loadNetworkFromCheckpoint ? "Loaded" : "Created", (int)net->GetTotalNumberOfNodes());
    if (net->GetDeviceId() < 0)
--- a/Source/SGDLib/SGD.h
+++ b/Source/SGDLib/SGD.h
@ -110,6 +110,8 @@ struct SGDParams : public ScriptableObjects::Object

    // SGDParams(SGDParams&&) = default; // (does not compile in VS 2013; not critical)

+    size_t GetMaxEpochs() const { return m_maxEpochs; } // read-only accessor; const so it can be called on const SGDParams
+
 protected:
    // learning rate per sample provided outside
    floatargvector m_learningRatesParam;
@ -342,10 +344,9 @@ public:
            m_parallelizationMethod = ParallelizationMethod::none;
    }

-    void Train(function<ComputationNetworkPtr(DEVICEID_TYPE)> createNetworkFn, DEVICEID_TYPE deviceId,
+    void Train(shared_ptr<ComputationNetwork> net, DEVICEID_TYPE deviceId,
               IDataReader* trainSetDataReader,
-               IDataReader* validationSetDataReader,
-               const bool makeMode = true);
+               IDataReader* validationSetDataReader, int startEpoch, bool loadNetworkFromCheckpoint);
    void Adapt(wstring origModelFileName, wstring refNodeName,
               IDataReader* trainSetDataReader,
               IDataReader* validationSetDataReader,
@ -483,6 +484,10 @@ public:
                               const double L1RegWeight,
                               const bool needAveMultiplier,
                               const bool useNesterovMomentum);
+    // determine the epoch to start training from; returns -1 if no checkpoint exists
+    int DetermineStartEpoch(const bool makeMode);
+
+    wstring GetModelNameForEpoch(const int epoch, bool bLastModel = false);

 protected:
    // UpdateWeights - update the weights in
@ -517,10 +522,6 @@ protected:
                            /*out*/ size_t& minibatchSize);

    wstring GetCheckPointFileNameForEpoch(const int epoch);
-    wstring GetModelNameForEpoch(const int epoch, bool bLastModel = false);
-
-    // return -1 if nothing exists
-    int DetermineStartEpoch(const bool makeMode);

    GradientsUpdateType GradUpdateType() const
    {
--- a/Source/SGDLib/SimpleEvaluator.h
+++ b/Source/SGDLib/SimpleEvaluator.h
@ -180,7 +180,7 @@ public:
                    m_gradHeader.reset(DistGradHeader::Create(evalNodes.size()), [](DistGradHeader* ptr) {
                        DistGradHeader::Destroy(ptr);
                    });
-                    m_distGradAgg = make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, false, m_traceLevel);
+                    m_distGradAgg = make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, false /*useAsyncAggregation*/, 0 /*syncStatsTrace*/);
                }

                m_gradHeader->numEvalNode = evalNodes.size();
--- a/Tests/EndToEndTests/CNTKv2Library/UnitTests/run-test
+++ b/Tests/EndToEndTests/CNTKv2Library/UnitTests/run-test
@ -21,6 +21,7 @@ mkdir $DataDir
 cp -R $DataSourceDir/MNIST/v0/Train-28x28_cntk_text.txt $DataDir || exit $?
 cp -R $DataSourceDir/CIFAR/v0/cifar-10-batches-py $DataDir || exit $?
 cp -R $TEST_DIR/../../../../Examples/Other/Simple2d/Data/SimpleDataTrain_cntk_text.txt $DataDir || exit $?
+cp -R $TEST_DIR/../../Text/SequenceClassification/Data/Train.ctf $DataDir || exit $?

 pushd $DataDir

--- a/Tests/EndToEndTests/Examples/Image/MNIST/01_OneHidden/baseline.linux.txt
+++ b/Tests/EndToEndTests/Examples/Image/MNIST/01_OneHidden/baseline.linux.txt
@ -272,8 +272,8 @@ Post-processing network...

 4 roots:
 	ce = CrossEntropyWithSoftmax()
-	err = ClassificationError()
-	errTop1 = ClassificationError()
+	err = ErrorPrediction()
+	errTop1 = ErrorPrediction()
 	ol.z = Plus()

 Validating network. 17 nodes to process in pass 1.
@ -292,9 +292,9 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 200], [200 x 1 x *] -> [10 x 1
 Validating --> ol.b = LearnableParameter() :  -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x 1 x *], [10 x 1] -> [10 x 1 x *]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
 Validating --> unnamed81 = LearnableParameter() :  -> [1 x 1]
-Validating --> errTop1 = ClassificationError (labels, ol.z, unnamed81) : [10 x *], [10 x 1 x *], [1 x 1] -> [1]
+Validating --> errTop1 = ErrorPrediction (labels, ol.z, unnamed81) : [10 x *], [10 x 1 x *], [1 x 1] -> [1]

 Validating network. 9 nodes to process in pass 2.

@ -314,8 +314,8 @@ Post-processing network complete.

 05/13/2016 15:10:02: Evaluation criterion node(s):

-05/13/2016 15:10:02: 	errTop1 = ClassificationError
-05/13/2016 15:10:02: 	err = ClassificationError
+05/13/2016 15:10:02: 	errTop1 = ErrorPrediction
+05/13/2016 15:10:02: 	err = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.
@ -390,8 +390,8 @@ Post-processing network...

 4 roots:
 	ce = CrossEntropyWithSoftmax()
-	err = ClassificationError()
-	errTop1 = ClassificationError()
+	err = ErrorPrediction()
+	errTop1 = ErrorPrediction()
 	ol.z = Plus()

 Validating network. 17 nodes to process in pass 1.
@ -410,9 +410,9 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 200], [200 x 1 x *1] -> [10 x 1
 Validating --> ol.b = LearnableParameter() :  -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x 1 x *1], [10 x 1] -> [10 x 1 x *1]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
 Validating --> unnamed81 = LearnableParameter() :  -> [1 x 1]
-Validating --> errTop1 = ClassificationError (labels, ol.z, unnamed81) : [10 x *1], [10 x 1 x *1], [1 x 1] -> [1]
+Validating --> errTop1 = ErrorPrediction (labels, ol.z, unnamed81) : [10 x *1], [10 x 1 x *1], [1 x 1] -> [1]

 Validating network. 9 nodes to process in pass 2.

--- a/Tests/EndToEndTests/Examples/Image/MNIST/01_OneHidden/baseline.windows.txt
+++ b/Tests/EndToEndTests/Examples/Image/MNIST/01_OneHidden/baseline.windows.txt
@ -270,8 +270,8 @@ Post-processing network...

 4 roots:
 	ce = CrossEntropyWithSoftmax()
-	err = ClassificationError()
-	errTop1 = ClassificationError()
+	err = ErrorPrediction()
+	errTop1 = ErrorPrediction()
 	ol.z = Plus()

 Validating network. 17 nodes to process in pass 1.
@ -290,9 +290,9 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 200], [200 x 1 x *] -> [10 x 1
 Validating --> ol.b = LearnableParameter() :  -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x 1 x *], [10 x 1] -> [10 x 1 x *]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
 Validating --> unnamed81 = LearnableParameter() :  -> [1 x 1]
-Validating --> errTop1 = ClassificationError (labels, ol.z, unnamed81) : [10 x *], [10 x 1 x *], [1 x 1] -> [1]
+Validating --> errTop1 = ErrorPrediction (labels, ol.z, unnamed81) : [10 x *], [10 x 1 x *], [1 x 1] -> [1]

 Validating network. 9 nodes to process in pass 2.

@ -312,8 +312,8 @@ Post-processing network complete.

 05/13/2016 08:15:53: Evaluation criterion node(s):

-05/13/2016 08:15:53: 	errTop1 = ClassificationError
-05/13/2016 08:15:53: 	err = ClassificationError
+05/13/2016 08:15:53: 	errTop1 = ErrorPrediction
+05/13/2016 08:15:53: 	err = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.
@ -388,8 +388,8 @@ Post-processing network...

 4 roots:
 	ce = CrossEntropyWithSoftmax()
-	err = ClassificationError()
-	errTop1 = ClassificationError()
+	err = ErrorPrediction()
+	errTop1 = ErrorPrediction()
 	ol.z = Plus()

 Validating network. 17 nodes to process in pass 1.
@ -408,9 +408,9 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 200], [200 x 1 x *1] -> [10 x 1
 Validating --> ol.b = LearnableParameter() :  -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x 1 x *1], [10 x 1] -> [10 x 1 x *1]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
 Validating --> unnamed81 = LearnableParameter() :  -> [1 x 1]
-Validating --> errTop1 = ClassificationError (labels, ol.z, unnamed81) : [10 x *1], [10 x 1 x *1], [1 x 1] -> [1]
+Validating --> errTop1 = ErrorPrediction (labels, ol.z, unnamed81) : [10 x *1], [10 x 1 x *1], [1 x 1] -> [1]

 Validating network. 9 nodes to process in pass 2.

--- a/Tests/EndToEndTests/Examples/Image/MNIST/02_Convolution/baseline.linux.txt
+++ b/Tests/EndToEndTests/Examples/Image/MNIST/02_Convolution/baseline.linux.txt
@ -284,7 +284,7 @@ Post-processing network...

 3 roots:
 	ce = CrossEntropyWithSoftmax()
-	err = ClassificationError()
+	err = ErrorPrediction()
 	ol.z = Plus()

 Validating network. 27 nodes to process in pass 1.
@ -315,7 +315,7 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 128], [128 x 1 x *] -> [10 x 1
 Validating --> ol.b = LearnableParameter() :  -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x 1 x *], [10 x 1] -> [10 x 1 x *]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]

 Validating network. 16 nodes to process in pass 2.

@ -343,7 +343,7 @@ Post-processing network complete.

 05/13/2016 15:10:11: Evaluation criterion node(s):

-05/13/2016 15:10:11: 	err = ClassificationError
+05/13/2016 15:10:11: 	err = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.
@ -429,7 +429,7 @@ Post-processing network...

 3 roots:
 	ce = CrossEntropyWithSoftmax()
-	err = ClassificationError()
+	err = ErrorPrediction()
 	ol.z = Plus()

 Validating network. 27 nodes to process in pass 1.
@ -460,7 +460,7 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 128], [128 x 1 x *1] -> [10 x 1
 Validating --> ol.b = LearnableParameter() :  -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x 1 x *1], [10 x 1] -> [10 x 1 x *1]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]

 Validating network. 16 nodes to process in pass 2.

--- a/Tests/EndToEndTests/Examples/Image/MNIST/02_Convolution/baseline.windows.txt
+++ b/Tests/EndToEndTests/Examples/Image/MNIST/02_Convolution/baseline.windows.txt
@ -282,7 +282,7 @@ Post-processing network...

 3 roots:
 	ce = CrossEntropyWithSoftmax()
-	err = ClassificationError()
+	err = ErrorPrediction()
 	ol.z = Plus()

 Validating network. 27 nodes to process in pass 1.
@ -313,7 +313,7 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 128], [128 x 1 x *] -> [10 x 1
 Validating --> ol.b = LearnableParameter() :  -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x 1 x *], [10 x 1] -> [10 x 1 x *]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]

 Validating network. 16 nodes to process in pass 2.

@ -341,7 +341,7 @@ Post-processing network complete.

 05/13/2016 08:16:18: Evaluation criterion node(s):

-05/13/2016 08:16:18: 	err = ClassificationError
+05/13/2016 08:16:18: 	err = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.
@ -427,7 +427,7 @@ Post-processing network...

 3 roots:
 	ce = CrossEntropyWithSoftmax()
-	err = ClassificationError()
+	err = ErrorPrediction()
 	ol.z = Plus()

 Validating network. 27 nodes to process in pass 1.
@ -458,7 +458,7 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 128], [128 x 1 x *1] -> [10 x 1
 Validating --> ol.b = LearnableParameter() :  -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x 1 x *1], [10 x 1] -> [10 x 1 x *1]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]

 Validating network. 16 nodes to process in pass 2.

--- a/Tests/EndToEndTests/Examples/Image/MNIST/03_ConvBatchNorm/baseline.linux.txt
+++ b/Tests/EndToEndTests/Examples/Image/MNIST/03_ConvBatchNorm/baseline.linux.txt
@ -287,7 +287,7 @@ Post-processing network...

 3 roots:
 	ce = CrossEntropyWithSoftmax()
-	err = ClassificationError()
+	err = ErrorPrediction()
 	ol.z = Plus()

 Validating network. 36 nodes to process in pass 1.
@ -329,7 +329,7 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 128], [128 x *] -> [10 x *]
 Validating --> ol.b = LearnableParameter() :  -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x *], [10 x 1] -> [10 x 1 x *]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]

 Validating network. 16 nodes to process in pass 2.

@ -363,7 +363,7 @@ Post-processing network complete.

 05/13/2016 15:10:29: Evaluation criterion node(s):

-05/13/2016 15:10:29: 	err = ClassificationError
+05/13/2016 15:10:29: 	err = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.
@ -462,7 +462,7 @@ Post-processing network...

 3 roots:
 	ce = CrossEntropyWithSoftmax()
-	err = ClassificationError()
+	err = ErrorPrediction()
 	ol.z = Plus()

 Validating network. 36 nodes to process in pass 1.
@ -502,7 +502,7 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 128], [128 x *1] -> [10 x *1]
 Validating --> ol.b = LearnableParameter() :  -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x *1], [10 x 1] -> [10 x 1 x *1]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]

 Validating network. 16 nodes to process in pass 2.

--- a/Tests/EndToEndTests/Examples/Image/MNIST/03_ConvBatchNorm/baseline.windows.txt
+++ b/Tests/EndToEndTests/Examples/Image/MNIST/03_ConvBatchNorm/baseline.windows.txt
@ -285,7 +285,7 @@ Post-processing network...

 3 roots:
 	ce = CrossEntropyWithSoftmax()
-	err = ClassificationError()
+	err = ErrorPrediction()
 	ol.z = Plus()

 Validating network. 36 nodes to process in pass 1.
@ -327,7 +327,7 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 128], [128 x *] -> [10 x *]
 Validating --> ol.b = LearnableParameter() :  -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x *], [10 x 1] -> [10 x 1 x *]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]

 Validating network. 16 nodes to process in pass 2.

@ -361,7 +361,7 @@ Post-processing network complete.

 05/13/2016 08:16:58: Evaluation criterion node(s):

-05/13/2016 08:16:58: 	err = ClassificationError
+05/13/2016 08:16:58: 	err = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.
@ -460,7 +460,7 @@ Post-processing network...

 3 roots:
 	ce = CrossEntropyWithSoftmax()
-	err = ClassificationError()
+	err = ErrorPrediction()
 	ol.z = Plus()

 Validating network. 36 nodes to process in pass 1.
@ -500,7 +500,7 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 128], [128 x *1] -> [10 x *1]
 Validating --> ol.b = LearnableParameter() :  -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x *1], [10 x 1] -> [10 x 1 x *1]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]

 Validating network. 16 nodes to process in pass 2.

--- a/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/baseline.linux.txt
+++ b/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/baseline.linux.txt
@ -1,49 +1,62 @@
-=== Running /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/gpu/release/bin/cntk configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/../../../../Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/Config/01_Conv.cntk currentDirectory=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData RunDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu DataDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../../Examples/Image/Miscellaneous/CIFAR-10 OutputDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu DeviceId=0 timestamping=true Train=[SGD=[maxEpochs=10]] Train=[SGD=[epochSize=100]] stderr=-
+CPU info:
+    CPU Model Name: Intel(R) Xeon(R) CPU E5-2630 v2 @ 2.60GHz
+    Hardware threads: 24
+    Total Memory: 264172964 kB
+-------------------------------------------------------------------
+=== Running /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/gpu/release/bin/cntk configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/01_Conv.cntk currentDirectory=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData RunDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu DataDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../Examples/Image/Miscellaneous/CIFAR-10 OutputDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu DeviceId=0 timestamping=true Train=[SGD=[maxEpochs=10]] Train=[SGD=[epochSize=100]] stderr=-
 -------------------------------------------------------------------
 Build info: 

-		Built time: May 13 2016 14:50:25
-		Last modified date: Thu May 12 14:00:37 2016
+		Built time: Aug 16 2016 09:41:56
+		Last modified date: Fri Aug 12 07:32:43 2016
 		Build type: release
 		Build target: GPU
 		With 1bit-SGD: no
-		Math lib: acml
+		Math lib: mkl
 		CUDA_PATH: /usr/local/cuda-7.5
 		CUB_PATH: /usr/local/cub-1.4.1
 		CUDNN_PATH: /usr/local/cudnn-4.0
 		Build Branch: HEAD
-		Build SHA1: 35fadc316f045d843bbd9b85061250a959268787
-		Built by philly on d8dc82703b0f
+		Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
+		Built by philly on f67b30a647de
 		Build Path: /home/philly/jenkins/workspace/CNTK-Build-Linux
 -------------------------------------------------------------------
-Changed current directory to /tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
-05/13/2016 15:10:47: Redirecting stderr to file -_Train_Test.log
-05/13/2016 15:10:47: -------------------------------------------------------------------
-05/13/2016 15:10:47: Build info: 
+Changed current directory to /tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
+08/16/2016 10:50:36: Redirecting stderr to file -_Train_Test.log
+08/16/2016 10:50:36: -------------------------------------------------------------------
+08/16/2016 10:50:36: Build info: 

-05/13/2016 15:10:47: 		Built time: May 13 2016 14:50:25
-05/13/2016 15:10:47: 		Last modified date: Thu May 12 14:00:37 2016
-05/13/2016 15:10:47: 		Build type: release
-05/13/2016 15:10:47: 		Build target: GPU
-05/13/2016 15:10:47: 		With 1bit-SGD: no
-05/13/2016 15:10:47: 		Math lib: acml
-05/13/2016 15:10:47: 		CUDA_PATH: /usr/local/cuda-7.5
-05/13/2016 15:10:47: 		CUB_PATH: /usr/local/cub-1.4.1
-05/13/2016 15:10:47: 		CUDNN_PATH: /usr/local/cudnn-4.0
-05/13/2016 15:10:47: 		Build Branch: HEAD
-05/13/2016 15:10:47: 		Build SHA1: 35fadc316f045d843bbd9b85061250a959268787
-05/13/2016 15:10:47: 		Built by philly on d8dc82703b0f
-05/13/2016 15:10:47: 		Build Path: /home/philly/jenkins/workspace/CNTK-Build-Linux
-05/13/2016 15:10:47: -------------------------------------------------------------------
+08/16/2016 10:50:36: 		Built time: Aug 16 2016 09:41:56
+08/16/2016 10:50:36: 		Last modified date: Fri Aug 12 07:32:43 2016
+08/16/2016 10:50:36: 		Build type: release
+08/16/2016 10:50:36: 		Build target: GPU
+08/16/2016 10:50:36: 		With 1bit-SGD: no
+08/16/2016 10:50:36: 		Math lib: mkl
+08/16/2016 10:50:36: 		CUDA_PATH: /usr/local/cuda-7.5
+08/16/2016 10:50:36: 		CUB_PATH: /usr/local/cub-1.4.1
+08/16/2016 10:50:36: 		CUDNN_PATH: /usr/local/cudnn-4.0
+08/16/2016 10:50:36: 		Build Branch: HEAD
+08/16/2016 10:50:36: 		Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
+08/16/2016 10:50:36: 		Built by philly on f67b30a647de
+08/16/2016 10:50:36: 		Build Path: /home/philly/jenkins/workspace/CNTK-Build-Linux
+08/16/2016 10:50:36: -------------------------------------------------------------------
+08/16/2016 10:50:37: -------------------------------------------------------------------
+08/16/2016 10:50:37: GPU info:

-05/13/2016 15:10:47: Running on localhost at 2016/05/13 15:10:47
-05/13/2016 15:10:47: Command line: 
-/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/gpu/release/bin/cntk  configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/../../../../Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/Config/01_Conv.cntk  currentDirectory=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData  RunDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu  DataDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData  ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../../Examples/Image/Miscellaneous/CIFAR-10  OutputDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu  DeviceId=0  timestamping=true  Train=[SGD=[maxEpochs=10]]  Train=[SGD=[epochSize=100]]  stderr=-
+08/16/2016 10:50:37: 		Device[0]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:50:37: 		Device[1]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:50:37: 		Device[2]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:50:37: 		Device[3]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:50:37: -------------------------------------------------------------------
+
+08/16/2016 10:50:37: Running on localhost at 2016/08/16 10:50:37
+08/16/2016 10:50:37: Command line: 
+/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/gpu/release/bin/cntk  configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/01_Conv.cntk  currentDirectory=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData  RunDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu  DataDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData  ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../Examples/Image/Miscellaneous/CIFAR-10  OutputDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu  DeviceId=0  timestamping=true  Train=[SGD=[maxEpochs=10]]  Train=[SGD=[epochSize=100]]  stderr=-



-05/13/2016 15:10:47: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
-05/13/2016 15:10:47: RootDir = "."
+08/16/2016 10:50:37: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
+08/16/2016 10:50:37: RootDir = "."
 ConfigDir = "$RootDir$"
 DataDir = "$RootDir$"
 OutputDir = "$RootDir$/Output"
@ -53,7 +66,6 @@ precision = "float"
 deviceId = 0
 imageLayout = "cudnn"
 initOnCPUOnly=true
-prefetch = "true"
 command = Train:Test
 modelPath = "$ModelDir$/01_Convolution"
 stderr = "$OutputDir$/01_Conv"
@ -86,7 +98,7 @@ Train = [
                format = "dense"
            ]
        ]
-    ]
+    ]    
 ]
 Test = [
    action = "test"
@ -104,42 +116,41 @@ Test = [
                format = "dense"
            ]
        ]
-    ]   
+    ]    
 ]
-currentDirectory=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
-RunDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu
-DataDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
-ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../../Examples/Image/Miscellaneous/CIFAR-10
-OutputDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu
+currentDirectory=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
+RunDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu
+DataDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
+ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../Examples/Image/Miscellaneous/CIFAR-10
+OutputDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu
 DeviceId=0
 timestamping=true
 Train=[SGD=[maxEpochs=10]]
 Train=[SGD=[epochSize=100]]
 stderr=-

-05/13/2016 15:10:47: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
+08/16/2016 10:50:37: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<

-05/13/2016 15:10:47: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
-05/13/2016 15:10:47: RootDir = "."
+08/16/2016 10:50:37: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+08/16/2016 10:50:37: RootDir = "."
 ConfigDir = "."
 DataDir = "."
 OutputDir = "./Output"
-ModelDir = "/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models"
-ndlMacros = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl"
+ModelDir = "/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models"
+ndlMacros = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl"
 precision = "float"
 deviceId = 0
 imageLayout = "cudnn"
 initOnCPUOnly=true
-prefetch = "true"
 command = Train:Test
-modelPath = "/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution"
-stderr = "/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/01_Conv"
+modelPath = "/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution"
+stderr = "/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/01_Conv"
 traceLevel = 1
 numMBsToShowResult = 500
 Train = [
    action = "train"
     NDLNetworkBuilder = [
-        networkDescription = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/01_Convolution.ndl"
+        networkDescription = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/01_Convolution.ndl"
    ]
    SGD = [
        epochSize = 49984
@ -152,7 +163,7 @@ Train = [
    ]
    reader = [
        readerType = "CNTKTextFormatReader"
-        file = "/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData/Train_cntk_text.txt"
+        file = "/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData/Train_cntk_text.txt"
        input = [
            features = [
                dim = 3072
@ -163,14 +174,14 @@ Train = [
                format = "dense"
            ]
        ]
-    ]
+    ]    
 ]
 Test = [
    action = "test"
    minibatchSize = 16
    reader = [
        readerType = "CNTKTextFormatReader"
-        file = "/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData/Test_cntk_text.txt"
+        file = "/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData/Test_cntk_text.txt"
        input = [
            features = [
                dim = 3072
@ -181,45 +192,44 @@ Test = [
                format = "dense"
            ]
        ]
-    ]   
+    ]    
 ]
-currentDirectory=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
-RunDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu
-DataDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
-ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../../Examples/Image/Miscellaneous/CIFAR-10
-OutputDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu
+currentDirectory=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
+RunDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu
+DataDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
+ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../Examples/Image/Miscellaneous/CIFAR-10
+OutputDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu
 DeviceId=0
 timestamping=true
 Train=[SGD=[maxEpochs=10]]
 Train=[SGD=[epochSize=100]]
 stderr=-

-05/13/2016 15:10:47: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+08/16/2016 10:50:37: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<

-05/13/2016 15:10:47: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+08/16/2016 10:50:37: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 configparameters: 01_Conv.cntk:command=Train:Test
-configparameters: 01_Conv.cntk:ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../../Examples/Image/Miscellaneous/CIFAR-10
-configparameters: 01_Conv.cntk:currentDirectory=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
-configparameters: 01_Conv.cntk:DataDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
+configparameters: 01_Conv.cntk:ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../Examples/Image/Miscellaneous/CIFAR-10
+configparameters: 01_Conv.cntk:currentDirectory=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
+configparameters: 01_Conv.cntk:DataDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
 configparameters: 01_Conv.cntk:deviceId=0
 configparameters: 01_Conv.cntk:imageLayout=cudnn
 configparameters: 01_Conv.cntk:initOnCPUOnly=true
-configparameters: 01_Conv.cntk:ModelDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models
-configparameters: 01_Conv.cntk:modelPath=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution
-configparameters: 01_Conv.cntk:ndlMacros=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl
+configparameters: 01_Conv.cntk:ModelDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models
+configparameters: 01_Conv.cntk:modelPath=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution
+configparameters: 01_Conv.cntk:ndlMacros=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl
 configparameters: 01_Conv.cntk:numMBsToShowResult=500
-configparameters: 01_Conv.cntk:OutputDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu
+configparameters: 01_Conv.cntk:OutputDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu
 configparameters: 01_Conv.cntk:precision=float
-configparameters: 01_Conv.cntk:prefetch=true
 configparameters: 01_Conv.cntk:RootDir=.
-configparameters: 01_Conv.cntk:RunDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu
+configparameters: 01_Conv.cntk:RunDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu
 configparameters: 01_Conv.cntk:stderr=-
 configparameters: 01_Conv.cntk:Test=[
    action = "test"
    minibatchSize = 16
    reader = [
        readerType = "CNTKTextFormatReader"
-        file = "/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData/Test_cntk_text.txt"
+        file = "/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData/Test_cntk_text.txt"
        input = [
            features = [
                dim = 3072
@ -230,7 +240,7 @@ configparameters: 01_Conv.cntk:Test=[
                format = "dense"
            ]
        ]
-    ]   
+    ]    
 ]

 configparameters: 01_Conv.cntk:timestamping=true
@ -238,7 +248,7 @@ configparameters: 01_Conv.cntk:traceLevel=1
 configparameters: 01_Conv.cntk:Train=[
    action = "train"
     NDLNetworkBuilder = [
-        networkDescription = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/01_Convolution.ndl"
+        networkDescription = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/01_Convolution.ndl"
    ]
    SGD = [
        epochSize = 49984
@ -251,7 +261,7 @@ configparameters: 01_Conv.cntk:Train=[
    ]
    reader = [
        readerType = "CNTKTextFormatReader"
-        file = "/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData/Train_cntk_text.txt"
+        file = "/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData/Train_cntk_text.txt"
        input = [
            features = [
                dim = 3072
@ -262,33 +272,57 @@ configparameters: 01_Conv.cntk:Train=[
                format = "dense"
            ]
        ]
-    ]
+    ]    
 ] [SGD=[maxEpochs=10]] [SGD=[epochSize=100]]

-05/13/2016 15:10:47: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
-05/13/2016 15:10:47: Commands: Train Test
-05/13/2016 15:10:47: Precision = "float"
-05/13/2016 15:10:47: CNTKModelPath: /tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution
-05/13/2016 15:10:47: CNTKCommandTrainInfo: Train : 10
-05/13/2016 15:10:47: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 10
+08/16/2016 10:50:37: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+08/16/2016 10:50:37: Commands: Train Test
+08/16/2016 10:50:37: Precision = "float"
+08/16/2016 10:50:37: CNTKModelPath: /tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution
+08/16/2016 10:50:37: CNTKCommandTrainInfo: Train : 10
+08/16/2016 10:50:37: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 10

-05/13/2016 15:10:47: ##############################################################################
-05/13/2016 15:10:47: #                                                                            #
-05/13/2016 15:10:47: # Action "train"                                                             #
-05/13/2016 15:10:47: #                                                                            #
-05/13/2016 15:10:47: ##############################################################################
+08/16/2016 10:50:37: ##############################################################################
+08/16/2016 10:50:37: #                                                                            #
+08/16/2016 10:50:37: # Action "train"                                                             #
+08/16/2016 10:50:37: #                                                                            #
+08/16/2016 10:50:37: ##############################################################################

-05/13/2016 15:10:47: CNTKCommandTrainBegin: Train
+08/16/2016 10:50:37: CNTKCommandTrainBegin: Train
 NDLBuilder Using GPU 0

-05/13/2016 15:10:47: Creating virgin network.
+08/16/2016 10:50:37: Creating virgin network.
+Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 0.000000.
+Node 'conv1_act.W' (LearnableParameter operation): Initializing Parameter[32 x 75] <- 0.000000.
+Node 'conv1_act.b' (LearnableParameter operation): Initializing Parameter[1 x 1 x 32] <- 0.000000.
+Node 'conv2_act.W' (LearnableParameter operation): Initializing Parameter[32 x 800] <- 0.000000.
+Node 'conv2_act.b' (LearnableParameter operation): Initializing Parameter[1 x 1 x 32] <- 0.000000.
+Node 'conv3_act.W' (LearnableParameter operation): Initializing Parameter[64 x 800] <- 0.000000.
+Node 'conv3_act.b' (LearnableParameter operation): Initializing Parameter[1 x 1 x 64] <- 0.000000.
+Node 'h1.W' (LearnableParameter operation): Initializing Parameter[64 x 3 x 3 x 64] <- 0.000000.
+Node 'h1.b' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 0.000000.
+Node 'OutputNodes.W' (LearnableParameter operation): Initializing Parameter[10 x 64] <- 0.000000.
+Node 'OutputNodes.b' (LearnableParameter operation): Initializing Parameter[10] <- 0.000000.
+Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 128.000000.
+Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 128.000000.
+Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 128.000000.
+Node 'conv1_act.W' (LearnableParameter operation): Initializing Parameter[32 x 75] <- gaussian(seed=1, range=0.023094*0.004300, onCPU=false).
 SetGaussianRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4
+Node 'conv1_act.b' (LearnableParameter operation): Initializing Parameter[1 x 1 x 32] <- 0.000000.
+Node 'conv2_act.W' (LearnableParameter operation): Initializing Parameter[32 x 800] <- gaussian(seed=2, range=0.007071*1.414000, onCPU=false).
+Node 'conv2_act.b' (LearnableParameter operation): Initializing Parameter[1 x 1 x 32] <- 0.000000.
+Node 'conv3_act.W' (LearnableParameter operation): Initializing Parameter[64 x 800] <- gaussian(seed=3, range=0.007071*1.414000, onCPU=false).
+Node 'conv3_act.b' (LearnableParameter operation): Initializing Parameter[1 x 1 x 64] <- 0.000000.
+Node 'h1.W' (LearnableParameter operation): Initializing Parameter[64 x 3 x 3 x 64] <- gaussian(seed=4, range=0.008333*12.000000, onCPU=false).
+Node 'h1.b' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 0.000000.
+Node 'OutputNodes.W' (LearnableParameter operation): Initializing Parameter[10 x 64] <- gaussian(seed=5, range=0.025000*1.500000, onCPU=false).
+Node 'OutputNodes.b' (LearnableParameter operation): Initializing Parameter[10] <- 0.000000.

 Post-processing network...

 3 roots:
 	CE = CrossEntropyWithSoftmax()
-	Err = ClassificationError()
+	Err = ErrorPrediction()
 	OutputNodes.z = Plus()

 Validating network. 34 nodes to process in pass 1.
@ -326,7 +360,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, h1_d) : [10 x 64], [64 x 1
 Validating --> OutputNodes.b = LearnableParameter() :  -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x 1 x *], [10] -> [10 x 1 x *]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x 1 x *] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *], [10 x 1 x *] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *], [10 x 1 x *] -> [1]

 Validating network. 21 nodes to process in pass 2.

@ -334,165 +368,183 @@ Validating network. 21 nodes to process in pass 2.
 Validating network, final pass.


-Using cuDNN convolution engine for geometry: Input: 32 x 32 x 3, Output: 32 x 32 x 32, Kernel: 5 x 5 x 3, Map: 1 x 1 x 32, Stride: 1 x 1 x 3, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0.
+conv1_act.c: using cuDNN convolution engine for geometry: Input: 32 x 32 x 3, Output: 32 x 32 x 32, Kernel: 5 x 5 x 3, Map: 1 x 1 x 32, Stride: 1 x 1 x 3, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0.

-Using cuDNN convolution engine for geometry: Input: 32 x 32 x 32, Output: 15 x 15 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0.
+pool1: using cuDNN convolution engine for geometry: Input: 32 x 32 x 32, Output: 15 x 15 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0.

-Using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 15 x 15 x 32, Kernel: 5 x 5 x 32, Map: 1 x 1 x 32, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0.
+conv2_act.c: using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 15 x 15 x 32, Kernel: 5 x 5 x 32, Map: 1 x 1 x 32, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0.

-Using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 7 x 7 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0.
+pool2: using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 7 x 7 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0.

-Using cuDNN convolution engine for geometry: Input: 7 x 7 x 32, Output: 7 x 7 x 64, Kernel: 5 x 5 x 32, Map: 1 x 1 x 64, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0.
+conv3_act.c: using cuDNN convolution engine for geometry: Input: 7 x 7 x 32, Output: 7 x 7 x 64, Kernel: 5 x 5 x 32, Map: 1 x 1 x 64, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0.

-Using cuDNN convolution engine for geometry: Input: 7 x 7 x 64, Output: 3 x 3 x 64, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0.
+pool3: using cuDNN convolution engine for geometry: Input: 7 x 7 x 64, Output: 3 x 3 x 64, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0.


 13 out of 34 nodes do not share the minibatch layout with the input data.

 Post-processing network complete.

-05/13/2016 15:10:48: Created model with 34 nodes on GPU 0.
+08/16/2016 10:50:38: Created model with 34 nodes on GPU 0.

-05/13/2016 15:10:48: Training criterion node(s):
-05/13/2016 15:10:48: 	CE = CrossEntropyWithSoftmax
+08/16/2016 10:50:38: Training criterion node(s):
+08/16/2016 10:50:38: 	CE = CrossEntropyWithSoftmax

-05/13/2016 15:10:48: Evaluation criterion node(s):
-
-05/13/2016 15:10:48: 	Err = ClassificationError
+08/16/2016 10:50:38: Evaluation criterion node(s):
+08/16/2016 10:50:38: 	Err = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.

-Memory Sharing Structure:
+Memory Sharing: Out of 63 matrices, 38 are shared as 17, and 25 are not shared.

-(nil): {[Err Gradient[1]] [featOffs Gradient[1 x 1]] [featScaled Gradient[32 x 32 x 3 x *]] [features Gradient[32 x 32 x 3 x *]] [labels Gradient[10 x *]] }
-0x2485d28: {[OutputNodes.z Value[10 x 1 x *]] }
-0x2485ee8: {[CE Value[1]] }
-0x2486168: {[conv1_act.W Gradient[32 x 75]] [conv1_act.p Value[32 x 32 x 32 x *]] }
-0x2486328: {[conv1_act.c Gradient[32 x 32 x 32 x *]] [conv1_act.y Value[32 x 32 x 32 x *]] }
-0x24864e8: {[conv1_act.p Gradient[32 x 32 x 32 x *]] [pool1 Value[15 x 15 x 32 x *]] }
-0x249a638: {[features Value[32 x 32 x 3 x *]] }
-0x2975298: {[conv1_act.b Value[1 x 1 x 32]] }
-0x2976b48: {[conv2_act.W Value[32 x 800]] }
-0x2977ae8: {[conv2_act.b Value[1 x 1 x 32]] }
-0x2979668: {[conv3_act.W Value[64 x 800]] }
-0x2979f08: {[conv3_act.b Value[1 x 1 x 64]] }
-0x297bae8: {[h1.W Value[64 x 3 x 3 x 64]] }
-0x297c538: {[h1.b Value[64 x 1]] }
-0x297d5c8: {[OutputNodes.W Value[10 x 64]] }
-0x297ea98: {[OutputNodes.b Value[10]] }
-0x2dd1458: {[featOffs Value[1 x 1]] }
-0x2dd2678: {[labels Value[10 x *]] }
-0x2dd2eb8: {[conv1_act.W Value[32 x 75]] }
-0x7a59dd8: {[Err Value[1]] }
-0x7a5d378: {[featScaled Value[32 x 32 x 3 x *]] }
-0x7a5d6d8: {[conv1_act.c Value[32 x 32 x 32 x *]] }
-0x7a5e478: {[conv2_act.c Value[15 x 15 x 32 x *]] }
-0x7a5e638: {[conv1_act.b Gradient[1 x 1 x 32]] [conv1_act.y Gradient[32 x 32 x 32 x *]] }
-0x7a5e7f8: {[conv2_act.W Gradient[32 x 800]] [conv2_act.p Value[15 x 15 x 32 x *]] }
-0x7a7ade8: {[conv2_act.c Gradient[15 x 15 x 32 x *]] [conv2_act.y Value[15 x 15 x 32 x *]] }
-0x7a7afa8: {[conv2_act.p Gradient[15 x 15 x 32 x *]] [pool1 Gradient[15 x 15 x 32 x *]] [pool2 Value[7 x 7 x 32 x *]] }
-0x7a7b168: {[conv3_act.c Value[7 x 7 x 64 x *]] }
-0x7a7b328: {[conv2_act.b Gradient[1 x 1 x 32]] [conv2_act.y Gradient[15 x 15 x 32 x *]] }
-0x7a7b4e8: {[conv3_act.W Gradient[64 x 800]] [conv3_act.p Value[7 x 7 x 64 x *]] }
-0x7a7b6a8: {[conv3_act.c Gradient[7 x 7 x 64 x *]] [conv3_act.y Value[7 x 7 x 64 x *]] }
-0x7a7b868: {[conv3_act.p Gradient[7 x 7 x 64 x *]] [pool2 Gradient[7 x 7 x 32 x *]] [pool3 Value[3 x 3 x 64 x *]] }
-0x7a7ba28: {[conv3_act.b Gradient[1 x 1 x 64]] [conv3_act.y Gradient[7 x 7 x 64 x *]] [h1.t Value[64 x *]] }
-0x7a7bbe8: {[h1.W Gradient[64 x 3 x 3 x 64]] [h1.z Value[64 x 1 x *]] }
-0x7a7bda8: {[h1.t Gradient[64 x *]] [h1.y Value[64 x 1 x *]] }
-0x7a7bf68: {[h1_d Value[64 x 1 x *]] }
-0x7a7c128: {[h1.z Gradient[64 x 1 x *]] [pool3 Gradient[3 x 3 x 64 x *]] }
-0x7a7c2e8: {[OutputNodes.t Value[10 x 1 x *]] [h1.b Gradient[64 x 1]] [h1.y Gradient[64 x 1 x *]] }
-0x7a7cdc8: {[CE Gradient[1]] }
-0x7a7cf88: {[OutputNodes.W Gradient[10 x 64]] [OutputNodes.z Gradient[10 x 1 x *]] }
-0x7a7d148: {[OutputNodes.t Gradient[10 x 1 x *]] }
-0x7a7d308: {[OutputNodes.b Gradient[10]] }
-0x7a7d4c8: {[h1_d Gradient[64 x 1 x *]] }
+	{ conv1_act.W : [32 x 75] (gradient)
+	  conv1_act.p : [32 x 32 x 32 x *] }
+	{ conv1_act.c : [32 x 32 x 32 x *] (gradient)
+	  conv1_act.y : [32 x 32 x 32 x *] }
+	{ conv1_act.p : [32 x 32 x 32 x *] (gradient)
+	  pool1 : [15 x 15 x 32 x *] }
+	{ conv1_act.b : [1 x 1 x 32] (gradient)
+	  conv1_act.y : [32 x 32 x 32 x *] (gradient) }
+	{ conv2_act.W : [32 x 800] (gradient)
+	  conv2_act.p : [15 x 15 x 32 x *] }
+	{ conv2_act.c : [15 x 15 x 32 x *] (gradient)
+	  conv2_act.y : [15 x 15 x 32 x *] }
+	{ conv2_act.p : [15 x 15 x 32 x *] (gradient)
+	  pool1 : [15 x 15 x 32 x *] (gradient)
+	  pool2 : [7 x 7 x 32 x *] }
+	{ conv2_act.b : [1 x 1 x 32] (gradient)
+	  conv2_act.y : [15 x 15 x 32 x *] (gradient) }
+	{ conv3_act.W : [64 x 800] (gradient)
+	  conv3_act.p : [7 x 7 x 64 x *] }
+	{ conv3_act.c : [7 x 7 x 64 x *] (gradient)
+	  conv3_act.y : [7 x 7 x 64 x *] }
+	{ conv3_act.p : [7 x 7 x 64 x *] (gradient)
+	  pool2 : [7 x 7 x 32 x *] (gradient)
+	  pool3 : [3 x 3 x 64 x *] }
+	{ conv3_act.b : [1 x 1 x 64] (gradient)
+	  conv3_act.y : [7 x 7 x 64 x *] (gradient)
+	  h1.t : [64 x *] }
+	{ h1.W : [64 x 3 x 3 x 64] (gradient)
+	  h1.z : [64 x 1 x *] }
+	{ h1.t : [64 x *] (gradient)
+	  h1.y : [64 x 1 x *] }
+	{ h1.z : [64 x 1 x *] (gradient)
+	  pool3 : [3 x 3 x 64 x *] (gradient) }
+	{ OutputNodes.t : [10 x 1 x *]
+	  h1.b : [64 x 1] (gradient)
+	  h1.y : [64 x 1 x *] (gradient) }
+	{ OutputNodes.W : [10 x 64] (gradient)
+	  OutputNodes.z : [10 x 1 x *] (gradient) }

-05/13/2016 15:10:48: No PreCompute nodes found, skipping PreCompute step.

-05/13/2016 15:10:48: Starting Epoch 1: learning rate per sample = 0.000156  effective momentum = 0.900000  momentum as time constant = 607.4 samples
+08/16/2016 10:50:38: Training 116906 parameters in 10 out of 10 parameter tensors and 29 nodes with gradient:

-05/13/2016 15:10:48: Starting minibatch loop.
-05/13/2016 15:10:51: Finished Epoch[ 1 of 10]: [Training] CE = 2.30242050 * 100; Err = 0.88000000 * 100; totalSamplesSeen = 100; learningRatePerSample = 0.00015625; epochTime=3.55904s
-05/13/2016 15:10:51: SGD: Saving checkpoint model '/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.1'
+08/16/2016 10:50:38: 	Node 'OutputNodes.W' (LearnableParameter operation) : [10 x 64]
+08/16/2016 10:50:38: 	Node 'OutputNodes.b' (LearnableParameter operation) : [10]
+08/16/2016 10:50:38: 	Node 'conv1_act.W' (LearnableParameter operation) : [32 x 75]
+08/16/2016 10:50:38: 	Node 'conv1_act.b' (LearnableParameter operation) : [1 x 1 x 32]
+08/16/2016 10:50:38: 	Node 'conv2_act.W' (LearnableParameter operation) : [32 x 800]
+08/16/2016 10:50:38: 	Node 'conv2_act.b' (LearnableParameter operation) : [1 x 1 x 32]
+08/16/2016 10:50:38: 	Node 'conv3_act.W' (LearnableParameter operation) : [64 x 800]
+08/16/2016 10:50:38: 	Node 'conv3_act.b' (LearnableParameter operation) : [1 x 1 x 64]
+08/16/2016 10:50:38: 	Node 'h1.W' (LearnableParameter operation) : [64 x 3 x 3 x 64]
+08/16/2016 10:50:38: 	Node 'h1.b' (LearnableParameter operation) : [64 x 1]

-05/13/2016 15:10:51: Starting Epoch 2: learning rate per sample = 0.000156  effective momentum = 0.900000  momentum as time constant = 607.4 samples
+08/16/2016 10:50:38: No PreCompute nodes found, or all already computed. Skipping pre-computation step.

-05/13/2016 15:10:51: Starting minibatch loop.
-05/13/2016 15:10:51: Finished Epoch[ 2 of 10]: [Training] CE = 2.30175842 * 100; Err = 0.94000000 * 100; totalSamplesSeen = 200; learningRatePerSample = 0.00015625; epochTime=0.011903s
-05/13/2016 15:10:51: SGD: Saving checkpoint model '/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.2'
+08/16/2016 10:50:38: Starting Epoch 1: learning rate per sample = 0.000156  effective momentum = 0.900000  momentum as time constant = 607.4 samples
+BlockRandomizer::StartEpoch: epoch 0: frames [0..100] (first sequence at sample 0), data subset 0 of 1

-05/13/2016 15:10:51: Starting Epoch 3: learning rate per sample = 0.000156  effective momentum = 0.900000  momentum as time constant = 607.4 samples
+08/16/2016 10:50:38: Starting minibatch loop.
+08/16/2016 10:50:41: Finished Epoch[ 1 of 10]: [Training] CE = 2.30223602 * 100; Err = 0.90000000 * 100; totalSamplesSeen = 100; learningRatePerSample = 0.00015625; epochTime=3.51082s
+08/16/2016 10:50:41: SGD: Saving checkpoint model '/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.1'

-05/13/2016 15:10:51: Starting minibatch loop.
-05/13/2016 15:10:51: Finished Epoch[ 3 of 10]: [Training] CE = 2.30054413 * 100; Err = 0.90000000 * 100; totalSamplesSeen = 300; learningRatePerSample = 0.00015625; epochTime=0.012701s
-05/13/2016 15:10:51: SGD: Saving checkpoint model '/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.3'
+08/16/2016 10:50:41: Starting Epoch 2: learning rate per sample = 0.000156  effective momentum = 0.900000  momentum as time constant = 607.4 samples
+BlockRandomizer::StartEpoch: epoch 1: frames [100..200] (first sequence at sample 100), data subset 0 of 1

-05/13/2016 15:10:51: Starting Epoch 4: learning rate per sample = 0.000156  effective momentum = 0.900000  momentum as time constant = 607.4 samples
+08/16/2016 10:50:41: Starting minibatch loop.
+08/16/2016 10:50:41: Finished Epoch[ 2 of 10]: [Training] CE = 2.30189240 * 100; Err = 0.87000000 * 100; totalSamplesSeen = 200; learningRatePerSample = 0.00015625; epochTime=0.012555s
+08/16/2016 10:50:41: SGD: Saving checkpoint model '/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.2'

-05/13/2016 15:10:51: Starting minibatch loop.
-05/13/2016 15:10:51: Finished Epoch[ 4 of 10]: [Training] CE = 2.30022812 * 100; Err = 0.88000000 * 100; totalSamplesSeen = 400; learningRatePerSample = 0.00015625; epochTime=0.01144s
-05/13/2016 15:10:51: SGD: Saving checkpoint model '/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.4'
+08/16/2016 10:50:41: Starting Epoch 3: learning rate per sample = 0.000156  effective momentum = 0.900000  momentum as time constant = 607.4 samples
+BlockRandomizer::StartEpoch: epoch 2: frames [200..300] (first sequence at sample 200), data subset 0 of 1

-05/13/2016 15:10:51: Starting Epoch 5: learning rate per sample = 0.000156  effective momentum = 0.900000  momentum as time constant = 607.4 samples
+08/16/2016 10:50:41: Starting minibatch loop.
+08/16/2016 10:50:41: Finished Epoch[ 3 of 10]: [Training] CE = 2.29965256 * 100; Err = 0.86000000 * 100; totalSamplesSeen = 300; learningRatePerSample = 0.00015625; epochTime=0.012394s
+08/16/2016 10:50:41: SGD: Saving checkpoint model '/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.3'

-05/13/2016 15:10:51: Starting minibatch loop.
-05/13/2016 15:10:51: Finished Epoch[ 5 of 10]: [Training] CE = 2.29579636 * 100; Err = 0.87000000 * 100; totalSamplesSeen = 500; learningRatePerSample = 0.00015625; epochTime=0.011529s
-05/13/2016 15:10:51: SGD: Saving checkpoint model '/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.5'
+08/16/2016 10:50:41: Starting Epoch 4: learning rate per sample = 0.000156  effective momentum = 0.900000  momentum as time constant = 607.4 samples
+BlockRandomizer::StartEpoch: epoch 3: frames [300..400] (first sequence at sample 300), data subset 0 of 1
+
+08/16/2016 10:50:41: Starting minibatch loop.
+08/16/2016 10:50:41: Finished Epoch[ 4 of 10]: [Training] CE = 2.29966064 * 100; Err = 0.91000000 * 100; totalSamplesSeen = 400; learningRatePerSample = 0.00015625; epochTime=0.0124s
+08/16/2016 10:50:41: SGD: Saving checkpoint model '/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.4'
+
+08/16/2016 10:50:41: Starting Epoch 5: learning rate per sample = 0.000156  effective momentum = 0.900000  momentum as time constant = 607.4 samples
+BlockRandomizer::StartEpoch: epoch 4: frames [400..500] (first sequence at sample 400), data subset 0 of 1
+
+08/16/2016 10:50:41: Starting minibatch loop.
+08/16/2016 10:50:41: Finished Epoch[ 5 of 10]: [Training] CE = 2.30450394 * 100; Err = 0.94000000 * 100; totalSamplesSeen = 500; learningRatePerSample = 0.00015625; epochTime=0.012302s
+08/16/2016 10:50:41: SGD: Saving checkpoint model '/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.5'
 Setting dropout rate to 0.5.

-05/13/2016 15:10:51: Starting Epoch 6: learning rate per sample = 0.000156  effective momentum = 0.900000  momentum as time constant = 607.4 samples
+08/16/2016 10:50:41: Starting Epoch 6: learning rate per sample = 0.000156  effective momentum = 0.900000  momentum as time constant = 607.4 samples
+BlockRandomizer::StartEpoch: epoch 5: frames [500..600] (first sequence at sample 500), data subset 0 of 1

-05/13/2016 15:10:51: Starting minibatch loop.
+08/16/2016 10:50:41: Starting minibatch loop.
 (GPU): creating curand object with seed 5
-05/13/2016 15:10:51: Finished Epoch[ 6 of 10]: [Training] CE = 2.30121231 * 100; Err = 0.84000000 * 100; totalSamplesSeen = 600; learningRatePerSample = 0.00015625; epochTime=0.012276s
-05/13/2016 15:10:51: SGD: Saving checkpoint model '/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.6'
+08/16/2016 10:50:41: Finished Epoch[ 6 of 10]: [Training] CE = 2.29013916 * 100; Err = 0.81000000 * 100; totalSamplesSeen = 600; learningRatePerSample = 0.00015625; epochTime=0.012412s
+08/16/2016 10:50:41: SGD: Saving checkpoint model '/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.6'

-05/13/2016 15:10:51: Starting Epoch 7: learning rate per sample = 0.000156  effective momentum = 0.900000  momentum as time constant = 607.4 samples
+08/16/2016 10:50:41: Starting Epoch 7: learning rate per sample = 0.000156  effective momentum = 0.900000  momentum as time constant = 607.4 samples
+BlockRandomizer::StartEpoch: epoch 6: frames [600..700] (first sequence at sample 600), data subset 0 of 1

-05/13/2016 15:10:51: Starting minibatch loop.
+08/16/2016 10:50:41: Starting minibatch loop.
 (GPU): creating curand object with seed 6
-05/13/2016 15:10:52: Finished Epoch[ 7 of 10]: [Training] CE = 2.28975647 * 100; Err = 0.93000000 * 100; totalSamplesSeen = 700; learningRatePerSample = 0.00015625; epochTime=0.011495s
-05/13/2016 15:10:52: SGD: Saving checkpoint model '/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.7'
+08/16/2016 10:50:41: Finished Epoch[ 7 of 10]: [Training] CE = 2.29815765 * 100; Err = 0.93000000 * 100; totalSamplesSeen = 700; learningRatePerSample = 0.00015625; epochTime=0.012303s
+08/16/2016 10:50:41: SGD: Saving checkpoint model '/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.7'

-05/13/2016 15:10:52: Starting Epoch 8: learning rate per sample = 0.000156  effective momentum = 0.900000  momentum as time constant = 607.4 samples
+08/16/2016 10:50:41: Starting Epoch 8: learning rate per sample = 0.000156  effective momentum = 0.900000  momentum as time constant = 607.4 samples
+BlockRandomizer::StartEpoch: epoch 7: frames [700..800] (first sequence at sample 700), data subset 0 of 1

-05/13/2016 15:10:52: Starting minibatch loop.
+08/16/2016 10:50:41: Starting minibatch loop.
 (GPU): creating curand object with seed 7
-05/13/2016 15:10:52: Finished Epoch[ 8 of 10]: [Training] CE = 2.29035095 * 100; Err = 0.91000000 * 100; totalSamplesSeen = 800; learningRatePerSample = 0.00015625; epochTime=0.012157s
-05/13/2016 15:10:52: SGD: Saving checkpoint model '/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.8'
+08/16/2016 10:50:41: Finished Epoch[ 8 of 10]: [Training] CE = 2.28805603 * 100; Err = 0.89000000 * 100; totalSamplesSeen = 800; learningRatePerSample = 0.00015625; epochTime=0.012517s
+08/16/2016 10:50:41: SGD: Saving checkpoint model '/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.8'

-05/13/2016 15:10:52: Starting Epoch 9: learning rate per sample = 0.000156  effective momentum = 0.900000  momentum as time constant = 607.4 samples
+08/16/2016 10:50:41: Starting Epoch 9: learning rate per sample = 0.000156  effective momentum = 0.900000  momentum as time constant = 607.4 samples
+BlockRandomizer::StartEpoch: epoch 8: frames [800..900] (first sequence at sample 800), data subset 0 of 1

-05/13/2016 15:10:52: Starting minibatch loop.
+08/16/2016 10:50:41: Starting minibatch loop.
 (GPU): creating curand object with seed 8
-05/13/2016 15:10:52: Finished Epoch[ 9 of 10]: [Training] CE = 2.29797729 * 100; Err = 0.87000000 * 100; totalSamplesSeen = 900; learningRatePerSample = 0.00015625; epochTime=0.011451s
-05/13/2016 15:10:52: SGD: Saving checkpoint model '/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.9'
+08/16/2016 10:50:41: Finished Epoch[ 9 of 10]: [Training] CE = 2.29380524 * 100; Err = 0.88000000 * 100; totalSamplesSeen = 900; learningRatePerSample = 0.00015625; epochTime=0.012463s
+08/16/2016 10:50:41: SGD: Saving checkpoint model '/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.9'

-05/13/2016 15:10:52: Starting Epoch 10: learning rate per sample = 0.000156  effective momentum = 0.900000  momentum as time constant = 607.4 samples
+08/16/2016 10:50:41: Starting Epoch 10: learning rate per sample = 0.000156  effective momentum = 0.900000  momentum as time constant = 607.4 samples
+BlockRandomizer::StartEpoch: epoch 9: frames [900..1000] (first sequence at sample 900), data subset 0 of 1

-05/13/2016 15:10:52: Starting minibatch loop.
+08/16/2016 10:50:41: Starting minibatch loop.
 (GPU): creating curand object with seed 9
-05/13/2016 15:10:52: Finished Epoch[10 of 10]: [Training] CE = 2.29764435 * 100; Err = 0.87000000 * 100; totalSamplesSeen = 1000; learningRatePerSample = 0.00015625; epochTime=0.012689s
-05/13/2016 15:10:52: SGD: Saving checkpoint model '/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution'
-05/13/2016 15:10:52: CNTKCommandTrainEnd: Train
+08/16/2016 10:50:41: Finished Epoch[10 of 10]: [Training] CE = 2.27814423 * 100; Err = 0.87000000 * 100; totalSamplesSeen = 1000; learningRatePerSample = 0.00015625; epochTime=0.012432s
+08/16/2016 10:50:41: SGD: Saving checkpoint model '/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution'
+08/16/2016 10:50:41: CNTKCommandTrainEnd: Train

-05/13/2016 15:10:52: Action "train" complete.
+08/16/2016 10:50:41: Action "train" complete.


-05/13/2016 15:10:52: ##############################################################################
-05/13/2016 15:10:52: #                                                                            #
-05/13/2016 15:10:52: # Action "test"                                                              #
-05/13/2016 15:10:52: #                                                                            #
-05/13/2016 15:10:52: ##############################################################################
+08/16/2016 10:50:41: ##############################################################################
+08/16/2016 10:50:41: #                                                                            #
+08/16/2016 10:50:41: # Action "test"                                                              #
+08/16/2016 10:50:41: #                                                                            #
+08/16/2016 10:50:41: ##############################################################################


 Post-processing network...

 3 roots:
 	CE = CrossEntropyWithSoftmax()
-	Err = ClassificationError()
+	Err = ErrorPrediction()
 	OutputNodes.z = Plus()

 Validating network. 34 nodes to process in pass 1.
@ -530,7 +582,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, h1_d) : [10 x 64], [64 x 1
 Validating --> OutputNodes.b = LearnableParameter() :  -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x 1 x *1], [10] -> [10 x 1 x *1]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x 1 x *1] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *1], [10 x 1 x *1] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *1], [10 x 1 x *1] -> [1]

 Validating network. 21 nodes to process in pass 2.

@ -538,17 +590,17 @@ Validating network. 21 nodes to process in pass 2.
 Validating network, final pass.


-Using cuDNN convolution engine for geometry: Input: 32 x 32 x 3, Output: 32 x 32 x 32, Kernel: 5 x 5 x 3, Map: 1 x 1 x 32, Stride: 1 x 1 x 3, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0.
+conv1_act.c: using cuDNN convolution engine for geometry: Input: 32 x 32 x 3, Output: 32 x 32 x 32, Kernel: 5 x 5 x 3, Map: 1 x 1 x 32, Stride: 1 x 1 x 3, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0.

-Using cuDNN convolution engine for geometry: Input: 32 x 32 x 32, Output: 15 x 15 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0.
+pool1: using cuDNN convolution engine for geometry: Input: 32 x 32 x 32, Output: 15 x 15 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0.

-Using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 15 x 15 x 32, Kernel: 5 x 5 x 32, Map: 1 x 1 x 32, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0.
+conv2_act.c: using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 15 x 15 x 32, Kernel: 5 x 5 x 32, Map: 1 x 1 x 32, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0.

-Using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 7 x 7 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0.
+pool2: using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 7 x 7 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0.

-Using cuDNN convolution engine for geometry: Input: 7 x 7 x 32, Output: 7 x 7 x 64, Kernel: 5 x 5 x 32, Map: 1 x 1 x 64, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0.
+conv3_act.c: using cuDNN convolution engine for geometry: Input: 7 x 7 x 32, Output: 7 x 7 x 64, Kernel: 5 x 5 x 32, Map: 1 x 1 x 64, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0.

-Using cuDNN convolution engine for geometry: Input: 7 x 7 x 64, Output: 3 x 3 x 64, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0.
+pool3: using cuDNN convolution engine for geometry: Input: 7 x 7 x 64, Output: 3 x 3 x 64, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0.


 13 out of 34 nodes do not share the minibatch layout with the input data.
@ -560,46 +612,14 @@ evalNodeNames are not specified, using all the default evalnodes and training cr

 Allocating matrices for forward and/or backward propagation.

-Memory Sharing Structure:
+Memory Sharing: Out of 34 matrices, 0 are shared as 0, and 34 are not shared.

-(nil): {[CE Gradient[1]] [Err Gradient[1]] [OutputNodes.W Gradient[10 x 64]] [OutputNodes.b Gradient[10]] [OutputNodes.t Gradient[10 x 1 x *1]] [OutputNodes.z Gradient[10 x 1 x *1]] [conv1_act.W Gradient[32 x 75]] [conv1_act.b Gradient[1 x 1 x 32]] [conv1_act.c Gradient[32 x 32 x 32 x *1]] [conv1_act.p Gradient[32 x 32 x 32 x *1]] [conv1_act.y Gradient[32 x 32 x 32 x *1]] [conv2_act.W Gradient[32 x 800]] [conv2_act.b Gradient[1 x 1 x 32]] [conv2_act.c Gradient[15 x 15 x 32 x *1]] [conv2_act.p Gradient[15 x 15 x 32 x *1]] [conv2_act.y Gradient[15 x 15 x 32 x *1]] [conv3_act.W Gradient[64 x 800]] [conv3_act.b Gradient[1 x 1 x 64]] [conv3_act.c Gradient[7 x 7 x 64 x *1]] [conv3_act.p Gradient[7 x 7 x 64 x *1]] [conv3_act.y Gradient[7 x 7 x 64 x *1]] [featOffs Gradient[1 x 1]] [featScaled Gradient[32 x 32 x 3 x *1]] [features Gradient[32 x 32 x 3 x *1]] [h1.W Gradient[64 x 3 x 3 x 64]] [h1.b Gradient[64 x 1]] [h1.t Gradient[64 x *1]] [h1.y Gradient[64 x 1 x *1]] [h1.z Gradient[64 x 1 x *1]] [h1_d Gradient[64 x 1 x *1]] [labels Gradient[10 x *1]] [pool1 Gradient[15 x 15 x 32 x *1]] [pool2 Gradient[7 x 7 x 32 x *1]] [pool3 Gradient[3 x 3 x 64 x *1]] }
-0x7fc883e04ba8: {[conv1_act.b Value[1 x 1 x 32]] }
-0x7fc883e05fc8: {[conv1_act.W Value[32 x 75]] }
-0x7fc883e06768: {[conv2_act.b Value[1 x 1 x 32]] }
-0x7fc883e06928: {[conv2_act.W Value[32 x 800]] }
-0x7fc883e085b8: {[conv3_act.b Value[1 x 1 x 64]] }
-0x7fc883e09528: {[conv3_act.W Value[64 x 800]] }
-0x7fc883e0b568: {[featOffs Value[1 x 1]] }
-0x7fc883e0c1e8: {[features Value[32 x 32 x 3 x *1]] }
-0x7fc883e0cc38: {[h1.b Value[64 x 1]] }
-0x7fc883e0cf08: {[h1.W Value[64 x 3 x 3 x 64]] }
-0x7fc883e0eb48: {[labels Value[10 x *1]] }
-0x7fc883e0f558: {[OutputNodes.b Value[10]] }
-0x7fc883e10068: {[OutputNodes.W Value[10 x 64]] }
-0x7fc883e286b8: {[Err Value[1]] }
-0x7fc883e2bd28: {[CE Value[1]] }
-0x7fc883e2bfa8: {[conv1_act.y Value[32 x 32 x 32 x *1]] }
-0x7fc883e54728: {[conv1_act.c Value[32 x 32 x 32 x *1]] }
-0x7fc883e54a88: {[featScaled Value[32 x 32 x 3 x *1]] }
-0x7fc883e54c18: {[conv1_act.p Value[32 x 32 x 32 x *1]] }
-0x7fc883e71a78: {[pool1 Value[15 x 15 x 32 x *1]] }
-0x7fc883e71c38: {[conv2_act.c Value[15 x 15 x 32 x *1]] }
-0x7fc883e71fb8: {[conv2_act.p Value[15 x 15 x 32 x *1]] }
-0x7fc883e72178: {[conv2_act.y Value[15 x 15 x 32 x *1]] }
-0x7fc883e72338: {[pool2 Value[7 x 7 x 32 x *1]] }
-0x7fc883e724f8: {[conv3_act.c Value[7 x 7 x 64 x *1]] }
-0x7fc883e72878: {[conv3_act.p Value[7 x 7 x 64 x *1]] }
-0x7fc883e72a38: {[conv3_act.y Value[7 x 7 x 64 x *1]] }
-0x7fc883e72bf8: {[pool3 Value[3 x 3 x 64 x *1]] }
-0x7fc883e72db8: {[h1.t Value[64 x *1]] }
-0x7fc883e72f78: {[h1.z Value[64 x 1 x *1]] }
-0x7fc883e73138: {[h1.y Value[64 x 1 x *1]] }
-0x7fc883e732f8: {[h1_d Value[64 x 1 x *1]] }
-0x7fc883e73678: {[OutputNodes.t Value[10 x 1 x *1]] }
-0x7fc883e73838: {[OutputNodes.z Value[10 x 1 x *1]] }

-05/13/2016 15:10:58: Final Results: Minibatch[1-625]: Err = 0.86430000 * 10000; CE = 2.28476029 * 10000; perplexity = 9.82333117
+BlockRandomizer::StartEpoch: epoch 0: frames [0..10000] (first sequence at sample 0), data subset 0 of 1
+08/16/2016 10:50:43: Minibatch[1-500]: Err = 0.86125000 * 8000; CE = 2.28389484 * 8000
+08/16/2016 10:50:43: Minibatch[501-625]: Err = 0.86350000 * 2000; CE = 2.28027481 * 2000
+08/16/2016 10:50:43: Final Results: Minibatch[1-625]: Err = 0.86170000 * 10000; CE = 2.28317084 * 10000; perplexity = 9.80772986

-05/13/2016 15:10:58: Action "test" complete.
+08/16/2016 10:50:43: Action "test" complete.

-05/13/2016 15:10:58: __COMPLETED__
+08/16/2016 10:50:43: __COMPLETED__
--- a/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/baseline.windows.txt
+++ b/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/baseline.windows.txt
@ -286,7 +286,7 @@ Post-processing network...

 3 roots:
 	CE = CrossEntropyWithSoftmax()
-	Err = ClassificationError()
+	Err = ErrorPrediction()
 	OutputNodes.z = Plus()

 Validating network. 34 nodes to process in pass 1.
@ -324,7 +324,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, h1_d) : [10 x 64], [64 x 1
 Validating --> OutputNodes.b = LearnableParameter() :  -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x 1 x *], [10] -> [10 x 1 x *]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x 1 x *] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *], [10 x 1 x *] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *], [10 x 1 x *] -> [1]

 Validating network. 21 nodes to process in pass 2.

@ -356,7 +356,7 @@ Post-processing network complete.

 05/13/2016 08:17:53: Evaluation criterion node(s):

-05/13/2016 08:17:53: 	Err = ClassificationError
+05/13/2016 08:17:53: 	Err = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.
@ -490,7 +490,7 @@ Post-processing network...

 3 roots:
 	CE = CrossEntropyWithSoftmax()
-	Err = ClassificationError()
+	Err = ErrorPrediction()
 	OutputNodes.z = Plus()

 Validating network. 34 nodes to process in pass 1.
@ -528,7 +528,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, h1_d) : [10 x 64], [64 x 1
 Validating --> OutputNodes.b = LearnableParameter() :  -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x 1 x *1], [10] -> [10 x 1 x *1]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x 1 x *1] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *1], [10 x 1 x *1] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *1], [10 x 1 x *1] -> [1]

 Validating network. 21 nodes to process in pass 2.

--- a/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv/baseline.linux.txt
+++ b/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv/baseline.linux.txt
@ -291,7 +291,7 @@ Post-processing network...

 3 roots:
 	CE = CrossEntropyWithSoftmax()
-	Err = ClassificationError()
+	Err = ErrorPrediction()
 	OutputNodes.z = Plus()

 Validating network. 45 nodes to process in pass 1.
@ -340,7 +340,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, h1.y) : [10 x 64], [64 x *]
 Validating --> OutputNodes.b = LearnableParameter() :  -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *], [10] -> [10 x *]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]

 Validating network. 20 nodes to process in pass 2.

@ -380,7 +380,7 @@ Post-processing network complete.

 05/13/2016 15:10:59: Evaluation criterion node(s):

-05/13/2016 15:10:59: 	Err = ClassificationError
+05/13/2016 15:10:59: 	Err = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.
@ -491,7 +491,7 @@ Post-processing network...

 3 roots:
 	CE = CrossEntropyWithSoftmax()
-	Err = ClassificationError()
+	Err = ErrorPrediction()
 	OutputNodes.z = Plus()

 Validating network. 45 nodes to process in pass 1.
@ -540,7 +540,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, h1.y) : [10 x 64], [64 x *1
 Validating --> OutputNodes.b = LearnableParameter() :  -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *1], [10] -> [10 x *1]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]

 Validating network. 20 nodes to process in pass 2.

--- a/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv/baseline.windows.txt
+++ b/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv/baseline.windows.txt
@ -289,7 +289,7 @@ Post-processing network...

 3 roots:
 	CE = CrossEntropyWithSoftmax()
-	Err = ClassificationError()
+	Err = ErrorPrediction()
 	OutputNodes.z = Plus()

 Validating network. 45 nodes to process in pass 1.
@ -338,7 +338,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, h1.y) : [10 x 64], [64 x *]
 Validating --> OutputNodes.b = LearnableParameter() :  -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *], [10] -> [10 x *]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]

 Validating network. 20 nodes to process in pass 2.

@ -378,7 +378,7 @@ Post-processing network complete.

 05/13/2016 08:18:26: Evaluation criterion node(s):

-05/13/2016 08:18:26: 	Err = ClassificationError
+05/13/2016 08:18:26: 	Err = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.
@ -489,7 +489,7 @@ Post-processing network...

 3 roots:
 	CE = CrossEntropyWithSoftmax()
-	Err = ClassificationError()
+	Err = ErrorPrediction()
 	OutputNodes.z = Plus()

 Validating network. 45 nodes to process in pass 1.
@ -538,7 +538,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, h1.y) : [10 x 64], [64 x *1
 Validating --> OutputNodes.b = LearnableParameter() :  -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *1], [10] -> [10 x *1]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]

 Validating network. 20 nodes to process in pass 2.

--- a/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet/baseline.linux.gpu.txt
+++ b/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet/baseline.linux.gpu.txt
@ -356,7 +356,7 @@ Post-processing network...

 3 roots:
 	CE = CrossEntropyWithSoftmax()
-	Err = ClassificationError()
+	Err = ErrorPrediction()
 	OutputNodes.z = Plus()

 Validating network. 184 nodes to process in pass 1.
@ -546,7 +546,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, pool) : [10 x 1 x 1 x 64],
 Validating --> OutputNodes.b = LearnableParameter() :  -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *], [10] -> [10 x *]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]

 Validating network. 75 nodes to process in pass 2.

@ -652,7 +652,7 @@ Post-processing network complete.

 05/03/2016 18:13:08: Evaluation criterion node(s):

-05/03/2016 18:13:08: 	Err = ClassificationError
+05/03/2016 18:13:08: 	Err = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.
@ -907,7 +907,7 @@ Post-processing network...

 3 roots:
 	CE = CrossEntropyWithSoftmax()
-	Err = ClassificationError()
+	Err = ErrorPrediction()
 	OutputNodes.z = Plus()

 Validating network. 184 nodes to process in pass 1.
@ -1095,7 +1095,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, pool) : [10 x 1 x 1 x 64],
 Validating --> OutputNodes.b = LearnableParameter() :  -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *1], [10] -> [10 x *1]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]

 Validating network. 75 nodes to process in pass 2.

--- a/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet/baseline.windows.txt
+++ b/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet/baseline.windows.txt
@ -354,7 +354,7 @@ Post-processing network...

 3 roots:
 	CE = CrossEntropyWithSoftmax()
-	Err = ClassificationError()
+	Err = ErrorPrediction()
 	OutputNodes.z = Plus()

 Validating network. 184 nodes to process in pass 1.
@ -544,7 +544,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, pool) : [10 x 1 x 1 x 64],
 Validating --> OutputNodes.b = LearnableParameter() :  -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *], [10] -> [10 x *]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]

 Validating network. 75 nodes to process in pass 2.

@ -650,7 +650,7 @@ Post-processing network complete.

 05/03/2016 14:04:12: Evaluation criterion node(s):

-05/03/2016 14:04:12: 	Err = ClassificationError
+05/03/2016 14:04:12: 	Err = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.
@ -905,7 +905,7 @@ Post-processing network...

 3 roots:
 	CE = CrossEntropyWithSoftmax()
-	Err = ClassificationError()
+	Err = ErrorPrediction()
 	OutputNodes.z = Plus()

 Validating network. 184 nodes to process in pass 1.
@ -1093,7 +1093,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, pool) : [10 x 1 x 1 x 64],
 Validating --> OutputNodes.b = LearnableParameter() :  -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *1], [10] -> [10 x *1]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]

 Validating network. 75 nodes to process in pass 2.

--- a/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56/baseline.linux.gpu.txt
+++ b/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56/baseline.linux.gpu.txt
@ -356,7 +356,7 @@ Post-processing network...

 3 roots:
 	CE = CrossEntropyWithSoftmax()
-	Err = ClassificationError()
+	Err = ErrorPrediction()
 	OutputNodes.z = Plus()

 Validating network. 949 nodes to process in pass 1.
@ -1311,7 +1311,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, pool) : [10 x 1 x 1 x 64],
 Validating --> OutputNodes.b = LearnableParameter() :  -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *], [10] -> [10 x *]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]

 Validating network. 390 nodes to process in pass 2.

@ -1777,7 +1777,7 @@ Post-processing network complete.

 05/03/2016 18:17:55: Evaluation criterion node(s):

-05/03/2016 18:17:55: 	Err = ClassificationError
+05/03/2016 18:17:55: 	Err = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.
@ -2932,7 +2932,7 @@ Post-processing network...

 3 roots:
 	CE = CrossEntropyWithSoftmax()
-	Err = ClassificationError()
+	Err = ErrorPrediction()
 	OutputNodes.z = Plus()

 Validating network. 949 nodes to process in pass 1.
@ -3885,7 +3885,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, pool) : [10 x 1 x 1 x 64],
 Validating --> OutputNodes.b = LearnableParameter() :  -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *1], [10] -> [10 x *1]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]

 Validating network. 390 nodes to process in pass 2.

--- a/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56/baseline.windows.txt
+++ b/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56/baseline.windows.txt
@ -354,7 +354,7 @@ Post-processing network...

 3 roots:
 	CE = CrossEntropyWithSoftmax()
-	Err = ClassificationError()
+	Err = ErrorPrediction()
 	OutputNodes.z = Plus()

 Validating network. 949 nodes to process in pass 1.
@ -1309,7 +1309,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, pool) : [10 x 1 x 1 x 64],
 Validating --> OutputNodes.b = LearnableParameter() :  -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *], [10] -> [10 x *]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]

 Validating network. 390 nodes to process in pass 2.

@ -1775,7 +1775,7 @@ Post-processing network complete.

 05/03/2016 14:05:00: Evaluation criterion node(s):

-05/03/2016 14:05:00: 	Err = ClassificationError
+05/03/2016 14:05:00: 	Err = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.
@ -2930,7 +2930,7 @@ Post-processing network...

 3 roots:
 	CE = CrossEntropyWithSoftmax()
-	Err = ClassificationError()
+	Err = ErrorPrediction()
 	OutputNodes.z = Plus()

 Validating network. 949 nodes to process in pass 1.
@ -3883,7 +3883,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, pool) : [10 x 1 x 1 x 64],
 Validating --> OutputNodes.b = LearnableParameter() :  -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *1], [10] -> [10 x *1]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]

 Validating network. 390 nodes to process in pass 2.

--- a/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/05_ConvLocal/baseline.linux.txt
+++ b/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/05_ConvLocal/baseline.linux.txt
@ -282,7 +282,7 @@ Post-processing network...

 3 roots:
 	CE = CrossEntropyWithSoftmax()
-	Err = ClassificationError()
+	Err = ErrorPrediction()
 	OutputNodes.z = Plus()

 Validating network. 32 nodes to process in pass 1.
@ -318,7 +318,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, conv4.y) : [10 x 7 x 7 x 32
 Validating --> OutputNodes.b = LearnableParameter() :  -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *], [10] -> [10 x *]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]

 Validating network. 19 nodes to process in pass 2.

@ -350,7 +350,7 @@ Post-processing network complete.

 05/13/2016 15:11:11: Evaluation criterion node(s):

-05/13/2016 15:11:11: 	Err = ClassificationError
+05/13/2016 15:11:11: 	Err = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.
@ -446,7 +446,7 @@ Post-processing network...

 3 roots:
 	CE = CrossEntropyWithSoftmax()
-	Err = ClassificationError()
+	Err = ErrorPrediction()
 	OutputNodes.z = Plus()

 Validating network. 32 nodes to process in pass 1.
@ -482,7 +482,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, conv4.y) : [10 x 7 x 7 x 32
 Validating --> OutputNodes.b = LearnableParameter() :  -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *1], [10] -> [10 x *1]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]

 Validating network. 19 nodes to process in pass 2.

--- a/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/05_ConvLocal/baseline.windows.txt
+++ b/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/05_ConvLocal/baseline.windows.txt
@ -280,7 +280,7 @@ Post-processing network...

 3 roots:
 	CE = CrossEntropyWithSoftmax()
-	Err = ClassificationError()
+	Err = ErrorPrediction()
 	OutputNodes.z = Plus()

 Validating network. 32 nodes to process in pass 1.
@ -316,7 +316,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, conv4.y) : [10 x 7 x 7 x 32
 Validating --> OutputNodes.b = LearnableParameter() :  -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *], [10] -> [10 x *]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]

 Validating network. 19 nodes to process in pass 2.

@ -348,7 +348,7 @@ Post-processing network complete.

 05/13/2016 08:19:02: Evaluation criterion node(s):

-05/13/2016 08:19:02: 	Err = ClassificationError
+05/13/2016 08:19:02: 	Err = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.
@ -444,7 +444,7 @@ Post-processing network...

 3 roots:
 	CE = CrossEntropyWithSoftmax()
-	Err = ClassificationError()
+	Err = ErrorPrediction()
 	OutputNodes.z = Plus()

 Validating network. 32 nodes to process in pass 1.
@ -480,7 +480,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, conv4.y) : [10 x 7 x 7 x 32
 Validating --> OutputNodes.b = LearnableParameter() :  -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *1], [10] -> [10 x *1]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]

 Validating network. 19 nodes to process in pass 2.

--- a/Tests/EndToEndTests/Examples/Other/Simple2d/MultiGpu/baseline.linux.cpu.txt
+++ b/Tests/EndToEndTests/Examples/Other/Simple2d/MultiGpu/baseline.linux.cpu.txt
@ -68,7 +68,7 @@ Multigpu_Demo_Train=[
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -169,7 +169,7 @@ Multigpu_Demo_Train=[
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -302,7 +302,7 @@ configparameters: Multigpu.cntk:Multigpu_Demo_Train=[
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -370,7 +370,7 @@ Post-processing network...

 7 roots:
 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
 	InvStdOfFeatures = InvStdDev()
 	MeanOfFeatures = Mean()
 	PosteriorProb = Softmax()
@ -399,7 +399,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *] -> [2 x 1 x *]
 Validating --> B2 = LearnableParameter() :  -> [2 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *], [2 x 1] -> [2 x 1 x *]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *] -> [2 x 1 x *]
 Validating --> Prior = Mean (labels) : [2 x *] -> [2]
 Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@ -423,14 +423,14 @@ Post-processing network complete.

 05/03/2016 15:21:43: Evaluation criterion node(s):

-05/03/2016 15:21:43: 	EvalClassificationError = ClassificationError
+05/03/2016 15:21:43: 	EvalErrorPrediction = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.

 Memory Sharing Structure:

-(nil): {[EvalClassificationError Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
+(nil): {[EvalErrorPrediction Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
 0x1abc7c8: {[InvStdOfFeatures Value[2]] }
 0x1b40348: {[features Value[2 x *]] }
 0x1b408b8: {[MeanOfFeatures Value[2]] }
@ -443,7 +443,7 @@ Memory Sharing Structure:
 0x1b46708: {[labels Value[2 x *]] }
 0x1b473e8: {[Prior Value[2]] }
 0x1b4b138: {[ScaledLogLikelihood Value[2 x 1 x *]] }
-0x1b4cc28: {[EvalClassificationError Value[1]] }
+0x1b4cc28: {[EvalErrorPrediction Value[1]] }
 0x1b4cea8: {[CrossEntropyWithSoftmax Value[1]] }
 0x1b4d388: {[H1 Value[50 x 1 x *]] [W0*features Gradient[50 x *]] }
 0x1b4d548: {[W0*features+B0 Gradient[50 x 1 x *]] [W1*H1 Value[50 x 1 x *]] }
@ -473,139 +473,139 @@ Memory Sharing Structure:
 05/03/2016 15:21:44: Starting Epoch 1: learning rate per sample = 0.020000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 15:21:44: Starting minibatch loop.
-05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[   1-  10]: CrossEntropyWithSoftmax = 0.69966235 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0538s; samplesPerSecond = 4647.4
-05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[  11-  20]: CrossEntropyWithSoftmax = 0.70639648 * 250; EvalClassificationError = 0.49600000 * 250; time = 0.1073s; samplesPerSecond = 2329.6
-05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[  21-  30]: CrossEntropyWithSoftmax = 0.70470264 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0631s; samplesPerSecond = 3961.3
-05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[  31-  40]: CrossEntropyWithSoftmax = 0.69813501 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0747s; samplesPerSecond = 3346.9
-05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[  41-  50]: CrossEntropyWithSoftmax = 0.73551416 * 250; EvalClassificationError = 0.57600000 * 250; time = 0.0900s; samplesPerSecond = 2778.4
-05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[  51-  60]: CrossEntropyWithSoftmax = 0.72432324 * 250; EvalClassificationError = 0.50800000 * 250; time = 0.0605s; samplesPerSecond = 4135.0
-05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[  61-  70]: CrossEntropyWithSoftmax = 0.73327588 * 250; EvalClassificationError = 0.48800000 * 250; time = 0.0619s; samplesPerSecond = 4039.0
-05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[  71-  80]: CrossEntropyWithSoftmax = 0.70092627 * 250; EvalClassificationError = 0.50400000 * 250; time = 0.0769s; samplesPerSecond = 3249.9
-05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[  81-  90]: CrossEntropyWithSoftmax = 0.72354980 * 250; EvalClassificationError = 0.46000000 * 250; time = 0.0799s; samplesPerSecond = 3129.0
-05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[  91- 100]: CrossEntropyWithSoftmax = 0.72148096 * 250; EvalClassificationError = 0.52000000 * 250; time = 0.0620s; samplesPerSecond = 4031.5
-05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.69814941 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.1278s; samplesPerSecond = 1955.9
-05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.70699121 * 250; EvalClassificationError = 0.54800000 * 250; time = 0.0821s; samplesPerSecond = 3044.1
-05/03/2016 15:21:45:  Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.69898437 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0755s; samplesPerSecond = 3312.4
-05/03/2016 15:21:45:  Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.71712695 * 250; EvalClassificationError = 0.54000000 * 250; time = 0.0657s; samplesPerSecond = 3804.8
-05/03/2016 15:21:45:  Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.69470703 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.1049s; samplesPerSecond = 2382.9
-05/03/2016 15:21:45:  Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.71375879 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.1180s; samplesPerSecond = 2117.9
-05/03/2016 15:21:45:  Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70381641 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.1065s; samplesPerSecond = 2347.9
-05/03/2016 15:21:45:  Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.71748633 * 250; EvalClassificationError = 0.48800000 * 250; time = 0.2709s; samplesPerSecond = 922.9
-05/03/2016 15:21:45:  Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.71863281 * 250; EvalClassificationError = 0.50400000 * 250; time = 0.1375s; samplesPerSecond = 1818.4
-05/03/2016 15:21:45:  Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.70715234 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.1143s; samplesPerSecond = 2186.6
-05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.70401074 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.1079s; samplesPerSecond = 2317.1
-05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70599414 * 250; EvalClassificationError = 0.48400000 * 250; time = 0.0917s; samplesPerSecond = 2727.7
-05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69628711 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0923s; samplesPerSecond = 2707.6
-05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.75920898 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0887s; samplesPerSecond = 2819.0
-05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.70542578 * 250; EvalClassificationError = 0.43600000 * 250; time = 0.0634s; samplesPerSecond = 3945.8
-05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.70643945 * 250; EvalClassificationError = 0.46400000 * 250; time = 0.0885s; samplesPerSecond = 2823.7
-05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.72481641 * 250; EvalClassificationError = 0.51600000 * 250; time = 0.0601s; samplesPerSecond = 4162.6
-05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.71133594 * 250; EvalClassificationError = 0.55600000 * 250; time = 0.0630s; samplesPerSecond = 3968.1
-05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.68605664 * 250; EvalClassificationError = 0.47200000 * 250; time = 0.0849s; samplesPerSecond = 2944.1
-05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.69535352 * 250; EvalClassificationError = 0.47200000 * 250; time = 0.0879s; samplesPerSecond = 2844.6
-05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.68741797 * 250; EvalClassificationError = 0.45200000 * 250; time = 0.0752s; samplesPerSecond = 3325.7
-05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.67916406 * 250; EvalClassificationError = 0.46000000 * 250; time = 0.0958s; samplesPerSecond = 2610.3
-05/03/2016 15:21:47:  Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.67841992 * 250; EvalClassificationError = 0.44800000 * 250; time = 0.1009s; samplesPerSecond = 2478.7
-05/03/2016 15:21:47:  Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.68038477 * 250; EvalClassificationError = 0.49200000 * 250; time = 0.1607s; samplesPerSecond = 1555.6
-05/03/2016 15:21:47:  Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.61937109 * 250; EvalClassificationError = 0.30400000 * 250; time = 0.1131s; samplesPerSecond = 2211.4
-05/03/2016 15:21:47:  Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.57844141 * 250; EvalClassificationError = 0.27200000 * 250; time = 0.1047s; samplesPerSecond = 2388.5
-05/03/2016 15:21:47:  Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.49124023 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0896s; samplesPerSecond = 2791.5
-05/03/2016 15:21:47:  Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.39071289 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0727s; samplesPerSecond = 3438.8
-05/03/2016 15:21:47:  Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.27650586 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.2624s; samplesPerSecond = 952.6
-05/03/2016 15:21:47:  Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.26430078 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0842s; samplesPerSecond = 2967.7
-05/03/2016 15:21:47: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.66664150 * 10000; EvalClassificationError = 0.44430000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=3.93174s
+05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[   1-  10]: CrossEntropyWithSoftmax = 0.69966235 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0538s; samplesPerSecond = 4647.4
+05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[  11-  20]: CrossEntropyWithSoftmax = 0.70639648 * 250; EvalErrorPrediction = 0.49600000 * 250; time = 0.1073s; samplesPerSecond = 2329.6
+05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[  21-  30]: CrossEntropyWithSoftmax = 0.70470264 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0631s; samplesPerSecond = 3961.3
+05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[  31-  40]: CrossEntropyWithSoftmax = 0.69813501 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0747s; samplesPerSecond = 3346.9
+05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[  41-  50]: CrossEntropyWithSoftmax = 0.73551416 * 250; EvalErrorPrediction = 0.57600000 * 250; time = 0.0900s; samplesPerSecond = 2778.4
+05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[  51-  60]: CrossEntropyWithSoftmax = 0.72432324 * 250; EvalErrorPrediction = 0.50800000 * 250; time = 0.0605s; samplesPerSecond = 4135.0
+05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[  61-  70]: CrossEntropyWithSoftmax = 0.73327588 * 250; EvalErrorPrediction = 0.48800000 * 250; time = 0.0619s; samplesPerSecond = 4039.0
+05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[  71-  80]: CrossEntropyWithSoftmax = 0.70092627 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.0769s; samplesPerSecond = 3249.9
+05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[  81-  90]: CrossEntropyWithSoftmax = 0.72354980 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0799s; samplesPerSecond = 3129.0
+05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[  91- 100]: CrossEntropyWithSoftmax = 0.72148096 * 250; EvalErrorPrediction = 0.52000000 * 250; time = 0.0620s; samplesPerSecond = 4031.5
+05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.69814941 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.1278s; samplesPerSecond = 1955.9
+05/03/2016 15:21:44:  Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.70699121 * 250; EvalErrorPrediction = 0.54800000 * 250; time = 0.0821s; samplesPerSecond = 3044.1
+05/03/2016 15:21:45:  Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.69898437 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0755s; samplesPerSecond = 3312.4
+05/03/2016 15:21:45:  Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.71712695 * 250; EvalErrorPrediction = 0.54000000 * 250; time = 0.0657s; samplesPerSecond = 3804.8
+05/03/2016 15:21:45:  Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.69470703 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.1049s; samplesPerSecond = 2382.9
+05/03/2016 15:21:45:  Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.71375879 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.1180s; samplesPerSecond = 2117.9
+05/03/2016 15:21:45:  Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70381641 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.1065s; samplesPerSecond = 2347.9
+05/03/2016 15:21:45:  Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.71748633 * 250; EvalErrorPrediction = 0.48800000 * 250; time = 0.2709s; samplesPerSecond = 922.9
+05/03/2016 15:21:45:  Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.71863281 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.1375s; samplesPerSecond = 1818.4
+05/03/2016 15:21:45:  Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.70715234 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.1143s; samplesPerSecond = 2186.6
+05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.70401074 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.1079s; samplesPerSecond = 2317.1
+05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70599414 * 250; EvalErrorPrediction = 0.48400000 * 250; time = 0.0917s; samplesPerSecond = 2727.7
+05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69628711 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0923s; samplesPerSecond = 2707.6
+05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.75920898 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0887s; samplesPerSecond = 2819.0
+05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.70542578 * 250; EvalErrorPrediction = 0.43600000 * 250; time = 0.0634s; samplesPerSecond = 3945.8
+05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.70643945 * 250; EvalErrorPrediction = 0.46400000 * 250; time = 0.0885s; samplesPerSecond = 2823.7
+05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.72481641 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0601s; samplesPerSecond = 4162.6
+05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.71133594 * 250; EvalErrorPrediction = 0.55600000 * 250; time = 0.0630s; samplesPerSecond = 3968.1
+05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.68605664 * 250; EvalErrorPrediction = 0.47200000 * 250; time = 0.0849s; samplesPerSecond = 2944.1
+05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.69535352 * 250; EvalErrorPrediction = 0.47200000 * 250; time = 0.0879s; samplesPerSecond = 2844.6
+05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.68741797 * 250; EvalErrorPrediction = 0.45200000 * 250; time = 0.0752s; samplesPerSecond = 3325.7
+05/03/2016 15:21:46:  Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.67916406 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0958s; samplesPerSecond = 2610.3
+05/03/2016 15:21:47:  Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.67841992 * 250; EvalErrorPrediction = 0.44800000 * 250; time = 0.1009s; samplesPerSecond = 2478.7
+05/03/2016 15:21:47:  Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.68038477 * 250; EvalErrorPrediction = 0.49200000 * 250; time = 0.1607s; samplesPerSecond = 1555.6
+05/03/2016 15:21:47:  Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.61937109 * 250; EvalErrorPrediction = 0.30400000 * 250; time = 0.1131s; samplesPerSecond = 2211.4
+05/03/2016 15:21:47:  Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.57844141 * 250; EvalErrorPrediction = 0.27200000 * 250; time = 0.1047s; samplesPerSecond = 2388.5
+05/03/2016 15:21:47:  Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.49124023 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0896s; samplesPerSecond = 2791.5
+05/03/2016 15:21:47:  Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.39071289 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0727s; samplesPerSecond = 3438.8
+05/03/2016 15:21:47:  Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.27650586 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.2624s; samplesPerSecond = 952.6
+05/03/2016 15:21:47:  Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.26430078 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0842s; samplesPerSecond = 2967.7
+05/03/2016 15:21:47: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.66664150 * 10000; EvalErrorPrediction = 0.44430000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=3.93174s
 05/03/2016 15:21:47: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152142.598996/CNTKTextFormatReader/Examples/Other/Simple2d_MultiGpu@release_cpu/Models/multigpu.dnn.1'

 05/03/2016 15:21:47: Starting Epoch 2: learning rate per sample = 0.008000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 15:21:47: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1).
-05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.20720006 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0545s; samplesPerSecond = 4583.4
-05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.19690290 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0641s; samplesPerSecond = 3899.7
-05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.16064646 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0770s; samplesPerSecond = 3247.1
-05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.13547171 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0640s; samplesPerSecond = 3904.2
-05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.18000261 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0732s; samplesPerSecond = 3413.6
-05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.17787841 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0790s; samplesPerSecond = 3164.0
-05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.16821879 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0880s; samplesPerSecond = 2839.4
-05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.16363456 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0854s; samplesPerSecond = 2926.8
-05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.19533907 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0774s; samplesPerSecond = 3228.6
-05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.19318692 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0820s; samplesPerSecond = 3049.5
-05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.12726279 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0766s; samplesPerSecond = 3261.6
-05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.18620067 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0773s; samplesPerSecond = 3235.5
-05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.11547500 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0797s; samplesPerSecond = 3136.6
-05/03/2016 15:21:49:  Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16675950 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0833s; samplesPerSecond = 2999.8
-05/03/2016 15:21:49:  Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.15807389 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0822s; samplesPerSecond = 3042.5
-05/03/2016 15:21:49:  Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.18389093 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0726s; samplesPerSecond = 3443.0
-05/03/2016 15:21:49:  Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.18269750 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0897s; samplesPerSecond = 2787.7
-05/03/2016 15:21:49:  Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18737841 * 250; EvalClassificationError = 0.09600000 * 250; time = 0.0963s; samplesPerSecond = 2597.3
-05/03/2016 15:21:49:  Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.20174757 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0811s; samplesPerSecond = 3081.1
-05/03/2016 15:21:49:  Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.13336708 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0732s; samplesPerSecond = 3414.6
-05/03/2016 15:21:49:  Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13851332 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0879s; samplesPerSecond = 2843.0
-05/03/2016 15:21:49:  Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.15422288 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0821s; samplesPerSecond = 3044.3
-05/03/2016 15:21:49:  Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.15478799 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0815s; samplesPerSecond = 3069.2
-05/03/2016 15:21:49:  Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.14530201 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0810s; samplesPerSecond = 3086.3
-05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.12192809 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.2596s; samplesPerSecond = 962.9
-05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.13975597 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0569s; samplesPerSecond = 4394.5
-05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12566363 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0911s; samplesPerSecond = 2744.6
-05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.18963051 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0765s; samplesPerSecond = 3267.2
-05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.17955467 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0914s; samplesPerSecond = 2736.4
-05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.18862103 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0772s; samplesPerSecond = 3236.7
-05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17503073 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0775s; samplesPerSecond = 3225.8
-05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.14741998 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0774s; samplesPerSecond = 3230.1
-05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.13803981 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0726s; samplesPerSecond = 3443.0
-05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.14139232 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0820s; samplesPerSecond = 3048.4
-05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13886877 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0766s; samplesPerSecond = 3264.1
-05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.15025864 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0852s; samplesPerSecond = 2933.5
-05/03/2016 15:21:51:  Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.14659342 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0903s; samplesPerSecond = 2767.4
-05/03/2016 15:21:51:  Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.13078795 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0784s; samplesPerSecond = 3187.6
-05/03/2016 15:21:51:  Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.19832882 * 250; EvalClassificationError = 0.11600000 * 250; time = 0.0772s; samplesPerSecond = 3240.4
-05/03/2016 15:21:51:  Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15828904 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0721s; samplesPerSecond = 3468.7
-05/03/2016 15:21:51: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.16210811 * 10000; EvalClassificationError = 0.07480000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=3.34279s
+05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.20720006 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0545s; samplesPerSecond = 4583.4
+05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.19690290 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0641s; samplesPerSecond = 3899.7
+05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.16064646 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0770s; samplesPerSecond = 3247.1
+05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.13547171 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0640s; samplesPerSecond = 3904.2
+05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.18000261 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0732s; samplesPerSecond = 3413.6
+05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.17787841 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0790s; samplesPerSecond = 3164.0
+05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.16821879 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0880s; samplesPerSecond = 2839.4
+05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.16363456 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0854s; samplesPerSecond = 2926.8
+05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.19533907 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0774s; samplesPerSecond = 3228.6
+05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.19318692 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0820s; samplesPerSecond = 3049.5
+05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.12726279 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0766s; samplesPerSecond = 3261.6
+05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.18620067 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0773s; samplesPerSecond = 3235.5
+05/03/2016 15:21:48:  Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.11547500 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0797s; samplesPerSecond = 3136.6
+05/03/2016 15:21:49:  Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16675950 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0833s; samplesPerSecond = 2999.8
+05/03/2016 15:21:49:  Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.15807389 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0822s; samplesPerSecond = 3042.5
+05/03/2016 15:21:49:  Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.18389093 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0726s; samplesPerSecond = 3443.0
+05/03/2016 15:21:49:  Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.18269750 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0897s; samplesPerSecond = 2787.7
+05/03/2016 15:21:49:  Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18737841 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0963s; samplesPerSecond = 2597.3
+05/03/2016 15:21:49:  Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.20174757 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0811s; samplesPerSecond = 3081.1
+05/03/2016 15:21:49:  Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.13336708 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0732s; samplesPerSecond = 3414.6
+05/03/2016 15:21:49:  Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13851332 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0879s; samplesPerSecond = 2843.0
+05/03/2016 15:21:49:  Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.15422288 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0821s; samplesPerSecond = 3044.3
+05/03/2016 15:21:49:  Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.15478799 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0815s; samplesPerSecond = 3069.2
+05/03/2016 15:21:49:  Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.14530201 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0810s; samplesPerSecond = 3086.3
+05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.12192809 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.2596s; samplesPerSecond = 962.9
+05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.13975597 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0569s; samplesPerSecond = 4394.5
+05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12566363 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0911s; samplesPerSecond = 2744.6
+05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.18963051 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0765s; samplesPerSecond = 3267.2
+05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.17955467 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0914s; samplesPerSecond = 2736.4
+05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.18862103 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0772s; samplesPerSecond = 3236.7
+05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17503073 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0775s; samplesPerSecond = 3225.8
+05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.14741998 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0774s; samplesPerSecond = 3230.1
+05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.13803981 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0726s; samplesPerSecond = 3443.0
+05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.14139232 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0820s; samplesPerSecond = 3048.4
+05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13886877 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0766s; samplesPerSecond = 3264.1
+05/03/2016 15:21:50:  Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.15025864 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0852s; samplesPerSecond = 2933.5
+05/03/2016 15:21:51:  Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.14659342 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0903s; samplesPerSecond = 2767.4
+05/03/2016 15:21:51:  Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.13078795 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0784s; samplesPerSecond = 3187.6
+05/03/2016 15:21:51:  Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.19832882 * 250; EvalErrorPrediction = 0.11600000 * 250; time = 0.0772s; samplesPerSecond = 3240.4
+05/03/2016 15:21:51:  Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15828904 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0721s; samplesPerSecond = 3468.7
+05/03/2016 15:21:51: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.16210811 * 10000; EvalErrorPrediction = 0.07480000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=3.34279s
 05/03/2016 15:21:51: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152142.598996/CNTKTextFormatReader/Examples/Other/Simple2d_MultiGpu@release_cpu/Models/multigpu.dnn.2'

 05/03/2016 15:21:51: Starting Epoch 3: learning rate per sample = 0.008000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 15:21:51: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1).
-05/03/2016 15:21:51:  Epoch[ 3 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.19031988 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0960s; samplesPerSecond = 2604.5
-05/03/2016 15:21:51:  Epoch[ 3 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.13920714 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0967s; samplesPerSecond = 2585.3
-05/03/2016 15:21:51:  Epoch[ 3 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.14595162 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0869s; samplesPerSecond = 2877.8
-05/03/2016 15:21:51:  Epoch[ 3 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.13324012 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0817s; samplesPerSecond = 3060.5
-05/03/2016 15:21:51:  Epoch[ 3 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.17358728 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0804s; samplesPerSecond = 3109.2
-05/03/2016 15:21:51:  Epoch[ 3 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.17949159 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0660s; samplesPerSecond = 3788.1
-05/03/2016 15:21:51:  Epoch[ 3 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.15009323 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0653s; samplesPerSecond = 3829.5
-05/03/2016 15:21:51:  Epoch[ 3 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.17060954 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0660s; samplesPerSecond = 3787.3
-05/03/2016 15:21:52:  Epoch[ 3 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.10410764 * 250; EvalClassificationError = 0.04000000 * 250; time = 0.0762s; samplesPerSecond = 3280.0
-05/03/2016 15:21:52:  Epoch[ 3 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.20572259 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.2571s; samplesPerSecond = 972.5
-05/03/2016 15:21:52:  Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.16519130 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0640s; samplesPerSecond = 3906.2
-05/03/2016 15:21:52:  Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.14908187 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0593s; samplesPerSecond = 4213.2
-05/03/2016 15:21:52:  Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.19227612 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0688s; samplesPerSecond = 3632.8
-05/03/2016 15:21:52:  Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13670934 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0532s; samplesPerSecond = 4700.3
-05/03/2016 15:21:52:  Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.21113164 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0693s; samplesPerSecond = 3609.4
-05/03/2016 15:21:52:  Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.13129944 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0882s; samplesPerSecond = 2833.6
-05/03/2016 15:21:52:  Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17304376 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0840s; samplesPerSecond = 2975.2
-05/03/2016 15:21:52:  Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16479250 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0685s; samplesPerSecond = 3648.5
-05/03/2016 15:21:52:  Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.14591786 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0976s; samplesPerSecond = 2561.0
-05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.12562012 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0969s; samplesPerSecond = 2580.7
-05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13442773 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0959s; samplesPerSecond = 2607.8
-05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.17125328 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0754s; samplesPerSecond = 3314.6
-05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.22482522 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.1037s; samplesPerSecond = 2410.8
-05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.18291792 * 250; EvalClassificationError = 0.11600000 * 250; time = 0.0650s; samplesPerSecond = 3844.3
-05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.20296558 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0823s; samplesPerSecond = 3038.9
-05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.22849719 * 250; EvalClassificationError = 0.12400000 * 250; time = 0.0828s; samplesPerSecond = 3020.2
-05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12500068 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0864s; samplesPerSecond = 2894.1
-05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.15719802 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0840s; samplesPerSecond = 2976.4
-05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.11520810 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0687s; samplesPerSecond = 3636.7
-05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.14159592 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0974s; samplesPerSecond = 2567.1
-05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.18509569 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0721s; samplesPerSecond = 3465.4
-05/03/2016 15:21:54:  Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.15008345 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0905s; samplesPerSecond = 2763.6
-05/03/2016 15:21:54:  Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.12866435 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0902s; samplesPerSecond = 2770.5
-05/03/2016 15:21:54:  Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.17640526 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0896s; samplesPerSecond = 2789.2
-05/03/2016 15:21:54:  Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.14982110 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.2845s; samplesPerSecond = 878.8
-05/03/2016 15:21:54:  Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.11472753 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0867s; samplesPerSecond = 2882.5
-05/03/2016 15:21:54:  Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16524783 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0755s; samplesPerSecond = 3312.4
-05/03/2016 15:21:54:  Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14961037 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0958s; samplesPerSecond = 2608.8
-05/03/2016 15:21:54:  Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.15972387 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0972s; samplesPerSecond = 2572.7
-05/03/2016 15:21:54:  Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.17867958 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0969s; samplesPerSecond = 2581.0
-05/03/2016 15:21:54: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.16073358 * 10000; EvalClassificationError = 0.07780000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=3.65495s
+05/03/2016 15:21:51:  Epoch[ 3 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.19031988 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0960s; samplesPerSecond = 2604.5
+05/03/2016 15:21:51:  Epoch[ 3 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.13920714 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0967s; samplesPerSecond = 2585.3
+05/03/2016 15:21:51:  Epoch[ 3 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.14595162 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0869s; samplesPerSecond = 2877.8
+05/03/2016 15:21:51:  Epoch[ 3 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.13324012 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0817s; samplesPerSecond = 3060.5
+05/03/2016 15:21:51:  Epoch[ 3 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.17358728 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0804s; samplesPerSecond = 3109.2
+05/03/2016 15:21:51:  Epoch[ 3 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.17949159 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0660s; samplesPerSecond = 3788.1
+05/03/2016 15:21:51:  Epoch[ 3 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.15009323 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0653s; samplesPerSecond = 3829.5
+05/03/2016 15:21:51:  Epoch[ 3 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.17060954 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0660s; samplesPerSecond = 3787.3
+05/03/2016 15:21:52:  Epoch[ 3 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.10410764 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0762s; samplesPerSecond = 3280.0
+05/03/2016 15:21:52:  Epoch[ 3 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.20572259 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.2571s; samplesPerSecond = 972.5
+05/03/2016 15:21:52:  Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.16519130 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0640s; samplesPerSecond = 3906.2
+05/03/2016 15:21:52:  Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.14908187 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0593s; samplesPerSecond = 4213.2
+05/03/2016 15:21:52:  Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.19227612 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0688s; samplesPerSecond = 3632.8
+05/03/2016 15:21:52:  Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13670934 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0532s; samplesPerSecond = 4700.3
+05/03/2016 15:21:52:  Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.21113164 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0693s; samplesPerSecond = 3609.4
+05/03/2016 15:21:52:  Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.13129944 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0882s; samplesPerSecond = 2833.6
+05/03/2016 15:21:52:  Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17304376 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0840s; samplesPerSecond = 2975.2
+05/03/2016 15:21:52:  Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16479250 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0685s; samplesPerSecond = 3648.5
+05/03/2016 15:21:52:  Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.14591786 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0976s; samplesPerSecond = 2561.0
+05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.12562012 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0969s; samplesPerSecond = 2580.7
+05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13442773 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0959s; samplesPerSecond = 2607.8
+05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.17125328 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0754s; samplesPerSecond = 3314.6
+05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.22482522 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.1037s; samplesPerSecond = 2410.8
+05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.18291792 * 250; EvalErrorPrediction = 0.11600000 * 250; time = 0.0650s; samplesPerSecond = 3844.3
+05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.20296558 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0823s; samplesPerSecond = 3038.9
+05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.22849719 * 250; EvalErrorPrediction = 0.12400000 * 250; time = 0.0828s; samplesPerSecond = 3020.2
+05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12500068 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0864s; samplesPerSecond = 2894.1
+05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.15719802 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0840s; samplesPerSecond = 2976.4
+05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.11520810 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0687s; samplesPerSecond = 3636.7
+05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.14159592 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0974s; samplesPerSecond = 2567.1
+05/03/2016 15:21:53:  Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.18509569 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0721s; samplesPerSecond = 3465.4
+05/03/2016 15:21:54:  Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.15008345 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0905s; samplesPerSecond = 2763.6
+05/03/2016 15:21:54:  Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.12866435 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0902s; samplesPerSecond = 2770.5
+05/03/2016 15:21:54:  Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.17640526 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0896s; samplesPerSecond = 2789.2
+05/03/2016 15:21:54:  Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.14982110 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.2845s; samplesPerSecond = 878.8
+05/03/2016 15:21:54:  Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.11472753 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0867s; samplesPerSecond = 2882.5
+05/03/2016 15:21:54:  Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16524783 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0755s; samplesPerSecond = 3312.4
+05/03/2016 15:21:54:  Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14961037 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0958s; samplesPerSecond = 2608.8
+05/03/2016 15:21:54:  Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.15972387 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0972s; samplesPerSecond = 2572.7
+05/03/2016 15:21:54:  Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.17867958 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0969s; samplesPerSecond = 2581.0
+05/03/2016 15:21:54: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.16073358 * 10000; EvalErrorPrediction = 0.07780000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=3.65495s
 05/03/2016 15:21:54: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152142.598996/CNTKTextFormatReader/Examples/Other/Simple2d_MultiGpu@release_cpu/Models/multigpu.dnn'
 05/03/2016 15:21:54: CNTKCommandTrainEnd: Multigpu_Demo_Train

@ -623,7 +623,7 @@ Post-processing network...

 7 roots:
 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
 	InvStdOfFeatures = InvStdDev()
 	MeanOfFeatures = Mean()
 	PosteriorProb = Softmax()
@ -652,7 +652,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *1] -> [2 x 1 x *1]
 Validating --> B2 = LearnableParameter() :  -> [2 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *1], [2 x 1] -> [2 x 1 x *1]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *1] -> [2 x 1 x *1]
 Validating --> Prior = Mean (labels) : [2 x *1] -> [2]
 Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@ -676,7 +676,7 @@ Allocating matrices for forward and/or backward propagation.

 Memory Sharing Structure:

-(nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalClassificationError Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
+(nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalErrorPrediction Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
 0x1abbf28: {[B0 Value[50 x 1]] }
 0x1b47908: {[W1 Value[50 x 50]] }
 0x1b48278: {[W2 Value[2 x 50]] }
@ -688,7 +688,7 @@ Memory Sharing Structure:
 0x1b50cd8: {[Prior Value[2]] }
 0x1b514f8: {[W0 Value[50 x 2]] }
 0x1b53938: {[B1 Value[50 x 1]] }
-0x1c0fd98: {[EvalClassificationError Value[1]] }
+0x1c0fd98: {[EvalErrorPrediction Value[1]] }
 0x1c0fef8: {[CrossEntropyWithSoftmax Value[1]] }
 0x1c10438: {[LogOfPrior Value[2]] }
 0x1c11f48: {[MVNormalizedFeatures Value[2 x *1]] }
@ -701,7 +701,7 @@ Memory Sharing Structure:
 0x1c12d78: {[W2*H1 Value[2 x 1 x *1]] }
 0x1c12f38: {[HLast Value[2 x 1 x *1]] }

-05/03/2016 15:21:55: Final Results: Minibatch[1-1]: EvalClassificationError = 0.05804312 * 603; CrossEntropyWithSoftmax = 0.12790061 * 603; perplexity = 1.13644005
+05/03/2016 15:21:55: Final Results: Minibatch[1-1]: EvalErrorPrediction = 0.05804312 * 603; CrossEntropyWithSoftmax = 0.12790061 * 603; perplexity = 1.13644005

 05/03/2016 15:21:55: Action "test" complete.

--- a/Tests/EndToEndTests/Examples/Other/Simple2d/MultiGpu/baseline.linux.gpu.txt
+++ b/Tests/EndToEndTests/Examples/Other/Simple2d/MultiGpu/baseline.linux.gpu.txt
@ -68,7 +68,7 @@ Multigpu_Demo_Train=[
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -169,7 +169,7 @@ Multigpu_Demo_Train=[
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -302,7 +302,7 @@ configparameters: Multigpu.cntk:Multigpu_Demo_Train=[
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -371,7 +371,7 @@ Post-processing network...

 7 roots:
 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
 	InvStdOfFeatures = InvStdDev()
 	MeanOfFeatures = Mean()
 	PosteriorProb = Softmax()
@ -400,7 +400,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *] -> [2 x 1 x *]
 Validating --> B2 = LearnableParameter() :  -> [2 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *], [2 x 1] -> [2 x 1 x *]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *] -> [2 x 1 x *]
 Validating --> Prior = Mean (labels) : [2 x *] -> [2]
 Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@ -424,14 +424,14 @@ Post-processing network complete.

 05/03/2016 15:21:55: Evaluation criterion node(s):

-05/03/2016 15:21:55: 	EvalClassificationError = ClassificationError
+05/03/2016 15:21:55: 	EvalErrorPrediction = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.

 Memory Sharing Structure:

-(nil): {[EvalClassificationError Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
+(nil): {[EvalErrorPrediction Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
 0x12a62e8: {[features Value[2 x *]] }
 0x20202b8: {[MeanOfFeatures Value[2]] }
 0x20207c8: {[InvStdOfFeatures Value[2]] }
@ -444,7 +444,7 @@ Memory Sharing Structure:
 0x278ae18: {[Prior Value[2]] }
 0x278c158: {[LogOfPrior Value[2]] }
 0x27908f8: {[H1 Value[50 x 1 x *]] [W0*features Gradient[50 x *]] }
-0x2790a18: {[EvalClassificationError Value[1]] }
+0x2790a18: {[EvalErrorPrediction Value[1]] }
 0x2790d18: {[ScaledLogLikelihood Value[2 x 1 x *]] }
 0x2790e78: {[CrossEntropyWithSoftmax Value[1]] }
 0x27966e8: {[B0 Value[50 x 1]] }
@ -474,139 +474,139 @@ Memory Sharing Structure:
 05/03/2016 15:21:56: Starting Epoch 1: learning rate per sample = 0.020000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 15:21:56: Starting minibatch loop.
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[   1-  10]: CrossEntropyWithSoftmax = 0.70004456 * 250; EvalClassificationError = 0.52000000 * 250; time = 0.0059s; samplesPerSecond = 42038.0
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[  11-  20]: CrossEntropyWithSoftmax = 0.70309900 * 250; EvalClassificationError = 0.51600000 * 250; time = 0.0049s; samplesPerSecond = 50525.5
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[  21-  30]: CrossEntropyWithSoftmax = 0.70606104 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0050s; samplesPerSecond = 50423.6
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[  31-  40]: CrossEntropyWithSoftmax = 0.69845532 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0049s; samplesPerSecond = 50689.4
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[  41-  50]: CrossEntropyWithSoftmax = 0.73496533 * 250; EvalClassificationError = 0.57600000 * 250; time = 0.0050s; samplesPerSecond = 50261.4
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[  51-  60]: CrossEntropyWithSoftmax = 0.72522827 * 250; EvalClassificationError = 0.50800000 * 250; time = 0.0050s; samplesPerSecond = 50454.1
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[  61-  70]: CrossEntropyWithSoftmax = 0.73287500 * 250; EvalClassificationError = 0.48800000 * 250; time = 0.0049s; samplesPerSecond = 50576.6
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[  71-  80]: CrossEntropyWithSoftmax = 0.70135547 * 250; EvalClassificationError = 0.51600000 * 250; time = 0.0049s; samplesPerSecond = 50566.3
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[  81-  90]: CrossEntropyWithSoftmax = 0.72466504 * 250; EvalClassificationError = 0.46000000 * 250; time = 0.0049s; samplesPerSecond = 50515.3
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[  91- 100]: CrossEntropyWithSoftmax = 0.72187500 * 250; EvalClassificationError = 0.52000000 * 250; time = 0.0049s; samplesPerSecond = 50730.5
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.69799023 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0049s; samplesPerSecond = 50751.1
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.70696387 * 250; EvalClassificationError = 0.54800000 * 250; time = 0.0050s; samplesPerSecond = 50454.1
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.69863965 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0050s; samplesPerSecond = 50393.1
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.71772461 * 250; EvalClassificationError = 0.54800000 * 250; time = 0.0048s; samplesPerSecond = 51899.5
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.69526270 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0047s; samplesPerSecond = 53544.7
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.71436426 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0047s; samplesPerSecond = 53498.8
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70399316 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0047s; samplesPerSecond = 53694.2
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.71745508 * 250; EvalClassificationError = 0.48800000 * 250; time = 0.0046s; samplesPerSecond = 53879.3
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.71963184 * 250; EvalClassificationError = 0.49600000 * 250; time = 0.0047s; samplesPerSecond = 53521.7
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.70689941 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0047s; samplesPerSecond = 53602.1
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.70425098 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 53890.9
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70622754 * 250; EvalClassificationError = 0.45200000 * 250; time = 0.0047s; samplesPerSecond = 53728.8
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69729492 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 53786.6
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.75974219 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0046s; samplesPerSecond = 54265.2
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.70631250 * 250; EvalClassificationError = 0.43600000 * 250; time = 0.0047s; samplesPerSecond = 53659.6
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.70705664 * 250; EvalClassificationError = 0.46400000 * 250; time = 0.0047s; samplesPerSecond = 53602.1
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.72660352 * 250; EvalClassificationError = 0.51600000 * 250; time = 0.0046s; samplesPerSecond = 54124.3
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.71369727 * 250; EvalClassificationError = 0.55600000 * 250; time = 0.0047s; samplesPerSecond = 53441.6
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.68916602 * 250; EvalClassificationError = 0.47200000 * 250; time = 0.0047s; samplesPerSecond = 53659.6
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.69964844 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0047s; samplesPerSecond = 53339.0
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.69387891 * 250; EvalClassificationError = 0.46000000 * 250; time = 0.0046s; samplesPerSecond = 53832.9
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.68885742 * 250; EvalClassificationError = 0.50400000 * 250; time = 0.0047s; samplesPerSecond = 53350.4
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.69388867 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0047s; samplesPerSecond = 53430.2
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.70363867 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0046s; samplesPerSecond = 53960.7
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.65449219 * 250; EvalClassificationError = 0.44400000 * 250; time = 0.0047s; samplesPerSecond = 53544.7
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.64607031 * 250; EvalClassificationError = 0.46000000 * 250; time = 0.0047s; samplesPerSecond = 53453.1
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.59492969 * 250; EvalClassificationError = 0.12400000 * 250; time = 0.0046s; samplesPerSecond = 53972.4
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.53965820 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0047s; samplesPerSecond = 53636.6
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.43681445 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0047s; samplesPerSecond = 52854.1
-05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.37407422 * 250; EvalClassificationError = 0.12000000 * 250; time = 0.0047s; samplesPerSecond = 53521.7
-05/03/2016 15:21:56: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.68409629 * 10000; EvalClassificationError = 0.45780000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=0.194983s
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[   1-  10]: CrossEntropyWithSoftmax = 0.70004456 * 250; EvalErrorPrediction = 0.52000000 * 250; time = 0.0059s; samplesPerSecond = 42038.0
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[  11-  20]: CrossEntropyWithSoftmax = 0.70309900 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0049s; samplesPerSecond = 50525.5
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[  21-  30]: CrossEntropyWithSoftmax = 0.70606104 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0050s; samplesPerSecond = 50423.6
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[  31-  40]: CrossEntropyWithSoftmax = 0.69845532 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0049s; samplesPerSecond = 50689.4
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[  41-  50]: CrossEntropyWithSoftmax = 0.73496533 * 250; EvalErrorPrediction = 0.57600000 * 250; time = 0.0050s; samplesPerSecond = 50261.4
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[  51-  60]: CrossEntropyWithSoftmax = 0.72522827 * 250; EvalErrorPrediction = 0.50800000 * 250; time = 0.0050s; samplesPerSecond = 50454.1
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[  61-  70]: CrossEntropyWithSoftmax = 0.73287500 * 250; EvalErrorPrediction = 0.48800000 * 250; time = 0.0049s; samplesPerSecond = 50576.6
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[  71-  80]: CrossEntropyWithSoftmax = 0.70135547 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0049s; samplesPerSecond = 50566.3
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[  81-  90]: CrossEntropyWithSoftmax = 0.72466504 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0049s; samplesPerSecond = 50515.3
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[  91- 100]: CrossEntropyWithSoftmax = 0.72187500 * 250; EvalErrorPrediction = 0.52000000 * 250; time = 0.0049s; samplesPerSecond = 50730.5
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.69799023 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0049s; samplesPerSecond = 50751.1
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.70696387 * 250; EvalErrorPrediction = 0.54800000 * 250; time = 0.0050s; samplesPerSecond = 50454.1
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.69863965 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0050s; samplesPerSecond = 50393.1
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.71772461 * 250; EvalErrorPrediction = 0.54800000 * 250; time = 0.0048s; samplesPerSecond = 51899.5
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.69526270 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0047s; samplesPerSecond = 53544.7
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.71436426 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0047s; samplesPerSecond = 53498.8
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70399316 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0047s; samplesPerSecond = 53694.2
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.71745508 * 250; EvalErrorPrediction = 0.48800000 * 250; time = 0.0046s; samplesPerSecond = 53879.3
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.71963184 * 250; EvalErrorPrediction = 0.49600000 * 250; time = 0.0047s; samplesPerSecond = 53521.7
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.70689941 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0047s; samplesPerSecond = 53602.1
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.70425098 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 53890.9
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70622754 * 250; EvalErrorPrediction = 0.45200000 * 250; time = 0.0047s; samplesPerSecond = 53728.8
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69729492 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 53786.6
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.75974219 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0046s; samplesPerSecond = 54265.2
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.70631250 * 250; EvalErrorPrediction = 0.43600000 * 250; time = 0.0047s; samplesPerSecond = 53659.6
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.70705664 * 250; EvalErrorPrediction = 0.46400000 * 250; time = 0.0047s; samplesPerSecond = 53602.1
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.72660352 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0046s; samplesPerSecond = 54124.3
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.71369727 * 250; EvalErrorPrediction = 0.55600000 * 250; time = 0.0047s; samplesPerSecond = 53441.6
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.68916602 * 250; EvalErrorPrediction = 0.47200000 * 250; time = 0.0047s; samplesPerSecond = 53659.6
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.69964844 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0047s; samplesPerSecond = 53339.0
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.69387891 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0046s; samplesPerSecond = 53832.9
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.68885742 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.0047s; samplesPerSecond = 53350.4
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.69388867 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0047s; samplesPerSecond = 53430.2
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.70363867 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0046s; samplesPerSecond = 53960.7
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.65449219 * 250; EvalErrorPrediction = 0.44400000 * 250; time = 0.0047s; samplesPerSecond = 53544.7
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.64607031 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0047s; samplesPerSecond = 53453.1
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.59492969 * 250; EvalErrorPrediction = 0.12400000 * 250; time = 0.0046s; samplesPerSecond = 53972.4
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.53965820 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0047s; samplesPerSecond = 53636.6
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.43681445 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0047s; samplesPerSecond = 52854.1
+05/03/2016 15:21:56:  Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.37407422 * 250; EvalErrorPrediction = 0.12000000 * 250; time = 0.0047s; samplesPerSecond = 53521.7
+05/03/2016 15:21:56: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.68409629 * 10000; EvalErrorPrediction = 0.45780000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=0.194983s
 05/03/2016 15:21:56: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152142.598996/CNTKTextFormatReader/Examples/Other/Simple2d_MultiGpu@release_gpu/Models/multigpu.dnn.1'

 05/03/2016 15:21:56: Starting Epoch 2: learning rate per sample = 0.008000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 15:21:56: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1).
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.27919647 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0093s; samplesPerSecond = 26818.3
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.24468611 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0080s; samplesPerSecond = 31063.6
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.19639892 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0081s; samplesPerSecond = 30982.8
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.16397861 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0080s; samplesPerSecond = 31222.7
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.19745002 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0081s; samplesPerSecond = 30944.4
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.19548896 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0081s; samplesPerSecond = 30871.8
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.18230148 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0081s; samplesPerSecond = 30910.0
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.17531255 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0080s; samplesPerSecond = 31059.8
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.20166559 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0081s; samplesPerSecond = 30944.4
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.19749058 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0081s; samplesPerSecond = 31055.9
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.13463336 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0081s; samplesPerSecond = 30963.6
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.19006259 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0080s; samplesPerSecond = 31063.6
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.12234776 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0079s; samplesPerSecond = 31605.6
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16962922 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0077s; samplesPerSecond = 32649.9
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.16091639 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0076s; samplesPerSecond = 32743.9
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.18624030 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32748.2
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.18465726 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0076s; samplesPerSecond = 32899.1
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18514518 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0077s; samplesPerSecond = 32620.0
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.20127224 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0076s; samplesPerSecond = 32791.2
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.13418547 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0076s; samplesPerSecond = 32701.1
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13995001 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0076s; samplesPerSecond = 32838.6
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.15602538 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32907.7
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.15448171 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32864.5
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.14780067 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32894.7
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.12361633 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0077s; samplesPerSecond = 32628.6
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.14079766 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0077s; samplesPerSecond = 32632.8
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12624363 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0076s; samplesPerSecond = 32899.1
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.18913222 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32894.7
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.17952681 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0076s; samplesPerSecond = 32786.9
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.18825452 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0076s; samplesPerSecond = 32825.6
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17517656 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0076s; samplesPerSecond = 32942.4
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.14744161 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32791.2
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.13888184 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0076s; samplesPerSecond = 32795.5
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.14156678 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0076s; samplesPerSecond = 32855.8
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13990591 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0077s; samplesPerSecond = 32607.3
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.15059729 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32855.8
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.14720846 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0076s; samplesPerSecond = 32799.8
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.13021243 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0076s; samplesPerSecond = 32912.1
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.19704037 * 250; EvalClassificationError = 0.11600000 * 250; time = 0.0076s; samplesPerSecond = 33029.5
-05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15858146 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0076s; samplesPerSecond = 32860.1
-05/03/2016 15:21:56: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.16938752 * 10000; EvalClassificationError = 0.07430000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=0.313881s
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.27919647 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0093s; samplesPerSecond = 26818.3
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.24468611 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0080s; samplesPerSecond = 31063.6
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.19639892 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0081s; samplesPerSecond = 30982.8
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.16397861 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0080s; samplesPerSecond = 31222.7
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.19745002 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0081s; samplesPerSecond = 30944.4
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.19548896 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0081s; samplesPerSecond = 30871.8
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.18230148 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0081s; samplesPerSecond = 30910.0
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.17531255 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0080s; samplesPerSecond = 31059.8
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.20166559 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0081s; samplesPerSecond = 30944.4
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.19749058 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0081s; samplesPerSecond = 31055.9
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.13463336 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0081s; samplesPerSecond = 30963.6
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.19006259 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0080s; samplesPerSecond = 31063.6
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.12234776 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0079s; samplesPerSecond = 31605.6
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16962922 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0077s; samplesPerSecond = 32649.9
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.16091639 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0076s; samplesPerSecond = 32743.9
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.18624030 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32748.2
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.18465726 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0076s; samplesPerSecond = 32899.1
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18514518 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0077s; samplesPerSecond = 32620.0
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.20127224 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0076s; samplesPerSecond = 32791.2
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.13418547 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0076s; samplesPerSecond = 32701.1
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13995001 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0076s; samplesPerSecond = 32838.6
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.15602538 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32907.7
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.15448171 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32864.5
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.14780067 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32894.7
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.12361633 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0077s; samplesPerSecond = 32628.6
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.14079766 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0077s; samplesPerSecond = 32632.8
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12624363 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0076s; samplesPerSecond = 32899.1
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.18913222 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32894.7
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.17952681 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0076s; samplesPerSecond = 32786.9
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.18825452 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0076s; samplesPerSecond = 32825.6
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17517656 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0076s; samplesPerSecond = 32942.4
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.14744161 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32791.2
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.13888184 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0076s; samplesPerSecond = 32795.5
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.14156678 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0076s; samplesPerSecond = 32855.8
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13990591 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0077s; samplesPerSecond = 32607.3
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.15059729 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32855.8
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.14720846 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0076s; samplesPerSecond = 32799.8
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.13021243 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0076s; samplesPerSecond = 32912.1
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.19704037 * 250; EvalErrorPrediction = 0.11600000 * 250; time = 0.0076s; samplesPerSecond = 33029.5
+05/03/2016 15:21:56:  Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15858146 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0076s; samplesPerSecond = 32860.1
+05/03/2016 15:21:56: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.16938752 * 10000; EvalErrorPrediction = 0.07430000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=0.313881s
 05/03/2016 15:21:56: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152142.598996/CNTKTextFormatReader/Examples/Other/Simple2d_MultiGpu@release_gpu/Models/multigpu.dnn.2'

 05/03/2016 15:21:56: Starting Epoch 3: learning rate per sample = 0.008000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 15:21:56: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1).
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.18888809 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0078s; samplesPerSecond = 32129.5
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.14084978 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0076s; samplesPerSecond = 32756.8
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.14561895 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0077s; samplesPerSecond = 32666.9
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.13238169 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0076s; samplesPerSecond = 32752.5
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.17465335 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32765.4
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.17752616 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0076s; samplesPerSecond = 32821.3
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.15030556 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0077s; samplesPerSecond = 32645.6
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.17118019 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0077s; samplesPerSecond = 32611.5
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.10379908 * 250; EvalClassificationError = 0.04000000 * 250; time = 0.0077s; samplesPerSecond = 32637.1
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.20636150 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0076s; samplesPerSecond = 32782.6
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.16606704 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0077s; samplesPerSecond = 32543.6
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.14937580 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0077s; samplesPerSecond = 32446.5
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.19161901 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32731.1
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13684752 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32696.8
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.21095939 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0076s; samplesPerSecond = 32688.3
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.13216461 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32769.7
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17341094 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0077s; samplesPerSecond = 32586.0
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16532641 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0076s; samplesPerSecond = 32868.8
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.14614740 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0076s; samplesPerSecond = 32696.8
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.12551177 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32705.4
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13419939 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32782.6
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.17050096 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32899.1
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.22579789 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0076s; samplesPerSecond = 32838.6
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.18219666 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0078s; samplesPerSecond = 32220.6
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.20347898 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32791.2
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.22972656 * 250; EvalClassificationError = 0.12000000 * 250; time = 0.0076s; samplesPerSecond = 32825.6
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12621914 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0076s; samplesPerSecond = 32890.4
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.15674728 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32808.4
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.11517532 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0077s; samplesPerSecond = 32658.4
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.14187870 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32860.1
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.18496784 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0076s; samplesPerSecond = 32929.4
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.15026403 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32942.4
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.12862609 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32925.1
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.17651362 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32778.3
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.14975908 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0076s; samplesPerSecond = 32981.5
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.11465866 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0076s; samplesPerSecond = 32838.6
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16513610 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0076s; samplesPerSecond = 32808.4
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14972374 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32977.2
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.15995582 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32825.6
-05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.17898927 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0076s; samplesPerSecond = 32756.8
-05/03/2016 15:21:56: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.16083773 * 10000; EvalClassificationError = 0.07760000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=0.307973s
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.18888809 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0078s; samplesPerSecond = 32129.5
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.14084978 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0076s; samplesPerSecond = 32756.8
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.14561895 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0077s; samplesPerSecond = 32666.9
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.13238169 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0076s; samplesPerSecond = 32752.5
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.17465335 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32765.4
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.17752616 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0076s; samplesPerSecond = 32821.3
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.15030556 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0077s; samplesPerSecond = 32645.6
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.17118019 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0077s; samplesPerSecond = 32611.5
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.10379908 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0077s; samplesPerSecond = 32637.1
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.20636150 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0076s; samplesPerSecond = 32782.6
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.16606704 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0077s; samplesPerSecond = 32543.6
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.14937580 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0077s; samplesPerSecond = 32446.5
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.19161901 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32731.1
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13684752 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32696.8
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.21095939 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0076s; samplesPerSecond = 32688.3
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.13216461 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32769.7
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17341094 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0077s; samplesPerSecond = 32586.0
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16532641 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0076s; samplesPerSecond = 32868.8
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.14614740 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0076s; samplesPerSecond = 32696.8
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.12551177 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32705.4
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13419939 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32782.6
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.17050096 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32899.1
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.22579789 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0076s; samplesPerSecond = 32838.6
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.18219666 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0078s; samplesPerSecond = 32220.6
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.20347898 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32791.2
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.22972656 * 250; EvalErrorPrediction = 0.12000000 * 250; time = 0.0076s; samplesPerSecond = 32825.6
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12621914 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0076s; samplesPerSecond = 32890.4
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.15674728 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32808.4
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.11517532 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0077s; samplesPerSecond = 32658.4
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.14187870 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32860.1
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.18496784 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0076s; samplesPerSecond = 32929.4
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.15026403 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32942.4
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.12862609 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32925.1
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.17651362 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32778.3
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.14975908 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0076s; samplesPerSecond = 32981.5
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.11465866 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0076s; samplesPerSecond = 32838.6
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16513610 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0076s; samplesPerSecond = 32808.4
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14972374 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32977.2
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.15995582 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32825.6
+05/03/2016 15:21:56:  Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.17898927 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0076s; samplesPerSecond = 32756.8
+05/03/2016 15:21:56: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.16083773 * 10000; EvalErrorPrediction = 0.07760000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=0.307973s
 05/03/2016 15:21:56: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152142.598996/CNTKTextFormatReader/Examples/Other/Simple2d_MultiGpu@release_gpu/Models/multigpu.dnn'
 05/03/2016 15:21:56: CNTKCommandTrainEnd: Multigpu_Demo_Train

@ -624,7 +624,7 @@ Post-processing network...

 7 roots:
 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
 	InvStdOfFeatures = InvStdDev()
 	MeanOfFeatures = Mean()
 	PosteriorProb = Softmax()
@ -653,7 +653,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *1] -> [2 x 1 x *1]
 Validating --> B2 = LearnableParameter() :  -> [2 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *1], [2 x 1] -> [2 x 1 x *1]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *1] -> [2 x 1 x *1]
 Validating --> Prior = Mean (labels) : [2 x *1] -> [2]
 Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@ -677,7 +677,7 @@ Allocating matrices for forward and/or backward propagation.

 Memory Sharing Structure:

-(nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalClassificationError Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
+(nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalErrorPrediction Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
 0x1222268: {[InvStdOfFeatures Value[2]] }
 0x1223258: {[W2 Value[2 x 50]] }
 0x12a56c8: {[B0 Value[50 x 1]] }
@ -697,12 +697,12 @@ Memory Sharing Structure:
 0x2adcc08: {[W0*features Value[50 x *1]] }
 0x2add0a8: {[W0 Value[50 x 2]] }
 0x2ae0518: {[W1 Value[50 x 50]] }
-0x68bf228: {[EvalClassificationError Value[1]] }
+0x68bf228: {[EvalErrorPrediction Value[1]] }
 0x68bf388: {[CrossEntropyWithSoftmax Value[1]] }
 0x68bf988: {[LogOfPrior Value[2]] }
 0x68d0438: {[features Value[2 x *1]] }

-05/03/2016 15:21:57: Final Results: Minibatch[1-1]: EvalClassificationError = 0.05804312 * 603; CrossEntropyWithSoftmax = 0.12736577 * 603; perplexity = 1.13583240
+05/03/2016 15:21:57: Final Results: Minibatch[1-1]: EvalErrorPrediction = 0.05804312 * 603; CrossEntropyWithSoftmax = 0.12736577 * 603; perplexity = 1.13583240

 05/03/2016 15:21:57: Action "test" complete.

--- a/Tests/EndToEndTests/Examples/Other/Simple2d/MultiGpu/baseline.windows.cpu.txt
+++ b/Tests/EndToEndTests/Examples/Other/Simple2d/MultiGpu/baseline.windows.cpu.txt
@ -66,7 +66,7 @@ Multigpu_Demo_Train=[
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -167,7 +167,7 @@ Multigpu_Demo_Train=[
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -300,7 +300,7 @@ configparameters: Multigpu.cntk:Multigpu_Demo_Train=[
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -368,7 +368,7 @@ Post-processing network...

 7 roots:
 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
 	InvStdOfFeatures = InvStdDev()
 	MeanOfFeatures = Mean()
 	PosteriorProb = Softmax()
@ -397,7 +397,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *] -> [2 x 1 x *]
 Validating --> B2 = LearnableParameter() :  -> [2 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *], [2 x 1] -> [2 x 1 x *]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *] -> [2 x 1 x *]
 Validating --> Prior = Mean (labels) : [2 x *] -> [2]
 Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@ -421,14 +421,14 @@ Post-processing network complete.

 05/03/2016 15:29:48: Evaluation criterion node(s):

-05/03/2016 15:29:48: 	EvalClassificationError = ClassificationError
+05/03/2016 15:29:48: 	EvalErrorPrediction = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.

 Memory Sharing Structure:

-0000000000000000: {[EvalClassificationError Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
+0000000000000000: {[EvalErrorPrediction Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
 000000CDDFBEECA0: {[features Value[2 x *]] }
 000000CDDFC7B170: {[W0*features+B0 Gradient[50 x 1 x *]] [W1*H1 Value[50 x 1 x *]] }
 000000CDDFC7B490: {[HLast Value[2 x 1 x *]] [W2 Gradient[2 x 50]] }
@ -438,7 +438,7 @@ Memory Sharing Structure:
 000000CDDFC7B990: {[H1 Value[50 x 1 x *]] [W0*features Gradient[50 x *]] }
 000000CDDFC7BC10: {[LogOfPrior Value[2]] }
 000000CDDFC7BCB0: {[MVNormalizedFeatures Value[2 x *]] }
-000000CDDFC7BD50: {[EvalClassificationError Value[1]] }
+000000CDDFC7BD50: {[EvalErrorPrediction Value[1]] }
 000000CDDFC7BDF0: {[W0 Gradient[50 x 2]] [W0*features+B0 Value[50 x 1 x *]] }
 000000CDDFC7BF30: {[ScaledLogLikelihood Value[2 x 1 x *]] }
 000000CDDFC7C070: {[H2 Value[50 x 1 x *]] [W1*H1 Gradient[50 x 1 x *]] }
@ -471,139 +471,139 @@ Memory Sharing Structure:
 05/03/2016 15:29:48: Starting Epoch 1: learning rate per sample = 0.020000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 15:29:48: Starting minibatch loop.
-05/03/2016 15:29:48:  Epoch[ 1 of 3]-Minibatch[   1-  10]: CrossEntropyWithSoftmax = 0.70511987 * 250; EvalClassificationError = 0.55200000 * 250; time = 0.0377s; samplesPerSecond = 6637.8
-05/03/2016 15:29:48:  Epoch[ 1 of 3]-Minibatch[  11-  20]: CrossEntropyWithSoftmax = 0.69754895 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0300s; samplesPerSecond = 8341.4
-05/03/2016 15:29:48:  Epoch[ 1 of 3]-Minibatch[  21-  30]: CrossEntropyWithSoftmax = 0.71056921 * 250; EvalClassificationError = 0.50400000 * 250; time = 0.0285s; samplesPerSecond = 8758.7
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[  31-  40]: CrossEntropyWithSoftmax = 0.72951074 * 250; EvalClassificationError = 0.56000000 * 250; time = 0.0290s; samplesPerSecond = 8610.3
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[  41-  50]: CrossEntropyWithSoftmax = 0.70946655 * 250; EvalClassificationError = 0.48800000 * 250; time = 0.0285s; samplesPerSecond = 8776.9
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[  51-  60]: CrossEntropyWithSoftmax = 0.72656787 * 250; EvalClassificationError = 0.54400000 * 250; time = 0.0289s; samplesPerSecond = 8652.6
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[  61-  70]: CrossEntropyWithSoftmax = 0.69337402 * 250; EvalClassificationError = 0.43200000 * 250; time = 0.0288s; samplesPerSecond = 8670.9
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[  71-  80]: CrossEntropyWithSoftmax = 0.73605176 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0277s; samplesPerSecond = 9033.4
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[  81-  90]: CrossEntropyWithSoftmax = 0.71453076 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0271s; samplesPerSecond = 9209.5
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[  91- 100]: CrossEntropyWithSoftmax = 0.75191992 * 250; EvalClassificationError = 0.47200000 * 250; time = 0.0247s; samplesPerSecond = 10134.6
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.75975146 * 250; EvalClassificationError = 0.53200000 * 250; time = 0.0270s; samplesPerSecond = 9243.5
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.73172168 * 250; EvalClassificationError = 0.50800000 * 250; time = 0.0268s; samplesPerSecond = 9333.9
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.76840820 * 250; EvalClassificationError = 0.53200000 * 250; time = 0.0265s; samplesPerSecond = 9435.7
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.70464746 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0269s; samplesPerSecond = 9309.3
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.70557227 * 250; EvalClassificationError = 0.46400000 * 250; time = 0.0253s; samplesPerSecond = 9880.3
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.72711816 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0267s; samplesPerSecond = 9357.7
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70076660 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0270s; samplesPerSecond = 9264.1
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.69409766 * 250; EvalClassificationError = 0.49600000 * 250; time = 0.0257s; samplesPerSecond = 9716.3
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.69139941 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0257s; samplesPerSecond = 9742.4
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.73361621 * 250; EvalClassificationError = 0.55200000 * 250; time = 0.0295s; samplesPerSecond = 8477.4
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.72225879 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0273s; samplesPerSecond = 9161.9
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70356348 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0261s; samplesPerSecond = 9562.8
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69928613 * 250; EvalClassificationError = 0.46400000 * 250; time = 0.0254s; samplesPerSecond = 9848.7
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.72360938 * 250; EvalClassificationError = 0.51600000 * 250; time = 0.0252s; samplesPerSecond = 9924.6
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.69871875 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0262s; samplesPerSecond = 9530.7
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.69114844 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0257s; samplesPerSecond = 9720.1
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.68648047 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0273s; samplesPerSecond = 9161.9
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.69657227 * 250; EvalClassificationError = 0.46400000 * 250; time = 0.0270s; samplesPerSecond = 9259.9
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.71585547 * 250; EvalClassificationError = 0.45200000 * 250; time = 0.0264s; samplesPerSecond = 9486.2
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.69730664 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0261s; samplesPerSecond = 9595.1
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.70432422 * 250; EvalClassificationError = 0.53200000 * 250; time = 0.0244s; samplesPerSecond = 10248.8
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.69991797 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0220s; samplesPerSecond = 11388.0
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.68696875 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0222s; samplesPerSecond = 11277.0
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.67331445 * 250; EvalClassificationError = 0.37200000 * 250; time = 0.0245s; samplesPerSecond = 10192.4
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.65711328 * 250; EvalClassificationError = 0.43200000 * 250; time = 0.0240s; samplesPerSecond = 10429.3
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.64534375 * 250; EvalClassificationError = 0.44800000 * 250; time = 0.0243s; samplesPerSecond = 10305.0
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.61021875 * 250; EvalClassificationError = 0.36400000 * 250; time = 0.0236s; samplesPerSecond = 10606.3
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.54191016 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0236s; samplesPerSecond = 10578.4
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.45624414 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0232s; samplesPerSecond = 10762.4
-05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.37636133 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0235s; samplesPerSecond = 10623.8
-05/03/2016 15:29:49: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.68695688 * 10000; EvalClassificationError = 0.45550000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=1.06166s
+05/03/2016 15:29:48:  Epoch[ 1 of 3]-Minibatch[   1-  10]: CrossEntropyWithSoftmax = 0.70511987 * 250; EvalErrorPrediction = 0.55200000 * 250; time = 0.0377s; samplesPerSecond = 6637.8
+05/03/2016 15:29:48:  Epoch[ 1 of 3]-Minibatch[  11-  20]: CrossEntropyWithSoftmax = 0.69754895 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0300s; samplesPerSecond = 8341.4
+05/03/2016 15:29:48:  Epoch[ 1 of 3]-Minibatch[  21-  30]: CrossEntropyWithSoftmax = 0.71056921 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.0285s; samplesPerSecond = 8758.7
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[  31-  40]: CrossEntropyWithSoftmax = 0.72951074 * 250; EvalErrorPrediction = 0.56000000 * 250; time = 0.0290s; samplesPerSecond = 8610.3
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[  41-  50]: CrossEntropyWithSoftmax = 0.70946655 * 250; EvalErrorPrediction = 0.48800000 * 250; time = 0.0285s; samplesPerSecond = 8776.9
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[  51-  60]: CrossEntropyWithSoftmax = 0.72656787 * 250; EvalErrorPrediction = 0.54400000 * 250; time = 0.0289s; samplesPerSecond = 8652.6
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[  61-  70]: CrossEntropyWithSoftmax = 0.69337402 * 250; EvalErrorPrediction = 0.43200000 * 250; time = 0.0288s; samplesPerSecond = 8670.9
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[  71-  80]: CrossEntropyWithSoftmax = 0.73605176 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0277s; samplesPerSecond = 9033.4
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[  81-  90]: CrossEntropyWithSoftmax = 0.71453076 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0271s; samplesPerSecond = 9209.5
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[  91- 100]: CrossEntropyWithSoftmax = 0.75191992 * 250; EvalErrorPrediction = 0.47200000 * 250; time = 0.0247s; samplesPerSecond = 10134.6
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.75975146 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0270s; samplesPerSecond = 9243.5
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.73172168 * 250; EvalErrorPrediction = 0.50800000 * 250; time = 0.0268s; samplesPerSecond = 9333.9
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.76840820 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0265s; samplesPerSecond = 9435.7
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.70464746 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0269s; samplesPerSecond = 9309.3
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.70557227 * 250; EvalErrorPrediction = 0.46400000 * 250; time = 0.0253s; samplesPerSecond = 9880.3
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.72711816 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0267s; samplesPerSecond = 9357.7
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70076660 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0270s; samplesPerSecond = 9264.1
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.69409766 * 250; EvalErrorPrediction = 0.49600000 * 250; time = 0.0257s; samplesPerSecond = 9716.3
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.69139941 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0257s; samplesPerSecond = 9742.4
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.73361621 * 250; EvalErrorPrediction = 0.55200000 * 250; time = 0.0295s; samplesPerSecond = 8477.4
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.72225879 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0273s; samplesPerSecond = 9161.9
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70356348 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0261s; samplesPerSecond = 9562.8
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69928613 * 250; EvalErrorPrediction = 0.46400000 * 250; time = 0.0254s; samplesPerSecond = 9848.7
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.72360938 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0252s; samplesPerSecond = 9924.6
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.69871875 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0262s; samplesPerSecond = 9530.7
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.69114844 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0257s; samplesPerSecond = 9720.1
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.68648047 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0273s; samplesPerSecond = 9161.9
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.69657227 * 250; EvalErrorPrediction = 0.46400000 * 250; time = 0.0270s; samplesPerSecond = 9259.9
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.71585547 * 250; EvalErrorPrediction = 0.45200000 * 250; time = 0.0264s; samplesPerSecond = 9486.2
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.69730664 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0261s; samplesPerSecond = 9595.1
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.70432422 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0244s; samplesPerSecond = 10248.8
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.69991797 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0220s; samplesPerSecond = 11388.0
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.68696875 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0222s; samplesPerSecond = 11277.0
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.67331445 * 250; EvalErrorPrediction = 0.37200000 * 250; time = 0.0245s; samplesPerSecond = 10192.4
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.65711328 * 250; EvalErrorPrediction = 0.43200000 * 250; time = 0.0240s; samplesPerSecond = 10429.3
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.64534375 * 250; EvalErrorPrediction = 0.44800000 * 250; time = 0.0243s; samplesPerSecond = 10305.0
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.61021875 * 250; EvalErrorPrediction = 0.36400000 * 250; time = 0.0236s; samplesPerSecond = 10606.3
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.54191016 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0236s; samplesPerSecond = 10578.4
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.45624414 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0232s; samplesPerSecond = 10762.4
+05/03/2016 15:29:49:  Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.37636133 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0235s; samplesPerSecond = 10623.8
+05/03/2016 15:29:49: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.68695688 * 10000; EvalErrorPrediction = 0.45550000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=1.06166s
 05/03/2016 15:29:49: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503162947.903093\CNTKTextFormatReader\Examples\Other\Simple2d_MultiGpu@release_cpu/Models/multigpu.dnn.1'

 05/03/2016 15:29:49: Starting Epoch 2: learning rate per sample = 0.008000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 15:29:49: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1).
-05/03/2016 15:29:49:  Epoch[ 2 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.28780429 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0246s; samplesPerSecond = 10181.2
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.28222478 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0246s; samplesPerSecond = 10178.3
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.23589864 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0255s; samplesPerSecond = 9796.2
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.21209458 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0242s; samplesPerSecond = 10312.3
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.20285913 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0243s; samplesPerSecond = 10283.0
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.21300948 * 250; EvalClassificationError = 0.09600000 * 250; time = 0.0252s; samplesPerSecond = 9928.5
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.17835594 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0256s; samplesPerSecond = 9753.8
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.18830077 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0257s; samplesPerSecond = 9740.1
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.14198478 * 250; EvalClassificationError = 0.04000000 * 250; time = 0.0250s; samplesPerSecond = 10019.2
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.15895022 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0237s; samplesPerSecond = 10566.8
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.21062646 * 250; EvalClassificationError = 0.11200000 * 250; time = 0.0238s; samplesPerSecond = 10517.9
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.16081948 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0223s; samplesPerSecond = 11186.7
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.15635713 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0234s; samplesPerSecond = 10700.2
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13008516 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0239s; samplesPerSecond = 10453.7
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.16625347 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0234s; samplesPerSecond = 10674.2
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.15001793 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0245s; samplesPerSecond = 10223.7
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.22343917 * 250; EvalClassificationError = 0.11200000 * 250; time = 0.0234s; samplesPerSecond = 10692.4
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18006735 * 250; EvalClassificationError = 0.09600000 * 250; time = 0.0245s; samplesPerSecond = 10194.5
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.15361620 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0235s; samplesPerSecond = 10636.9
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.17039588 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0246s; samplesPerSecond = 10177.1
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.15516786 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0237s; samplesPerSecond = 10544.1
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.15969617 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0225s; samplesPerSecond = 11102.2
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.15939439 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0234s; samplesPerSecond = 10697.9
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.15300194 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0233s; samplesPerSecond = 10729.2
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.14902476 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0231s; samplesPerSecond = 10811.7
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.15043256 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0231s; samplesPerSecond = 10823.4
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.15531360 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0229s; samplesPerSecond = 10936.1
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.17990796 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0248s; samplesPerSecond = 10088.4
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.22925668 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0229s; samplesPerSecond = 10913.7
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16843626 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0234s; samplesPerSecond = 10682.8
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.18045325 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0236s; samplesPerSecond = 10585.6
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.13337526 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0221s; samplesPerSecond = 11308.6
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.14332977 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0245s; samplesPerSecond = 10219.9
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.18749446 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0242s; samplesPerSecond = 10326.7
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.15505967 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0236s; samplesPerSecond = 10587.8
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.19616616 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0228s; samplesPerSecond = 10980.3
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.17305907 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0236s; samplesPerSecond = 10610.3
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.15197365 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0249s; samplesPerSecond = 10033.3
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.12102416 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0238s; samplesPerSecond = 10483.5
-05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15278496 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0235s; samplesPerSecond = 10646.9
-05/03/2016 15:29:50: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.17643784 * 10000; EvalClassificationError = 0.07560000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=0.957696s
+05/03/2016 15:29:49:  Epoch[ 2 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.28780429 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0246s; samplesPerSecond = 10181.2
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.28222478 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0246s; samplesPerSecond = 10178.3
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.23589864 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0255s; samplesPerSecond = 9796.2
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.21209458 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0242s; samplesPerSecond = 10312.3
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.20285913 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0243s; samplesPerSecond = 10283.0
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.21300948 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0252s; samplesPerSecond = 9928.5
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.17835594 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0256s; samplesPerSecond = 9753.8
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.18830077 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0257s; samplesPerSecond = 9740.1
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.14198478 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0250s; samplesPerSecond = 10019.2
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.15895022 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0237s; samplesPerSecond = 10566.8
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.21062646 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0238s; samplesPerSecond = 10517.9
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.16081948 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0223s; samplesPerSecond = 11186.7
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.15635713 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0234s; samplesPerSecond = 10700.2
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13008516 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0239s; samplesPerSecond = 10453.7
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.16625347 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0234s; samplesPerSecond = 10674.2
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.15001793 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0245s; samplesPerSecond = 10223.7
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.22343917 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0234s; samplesPerSecond = 10692.4
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18006735 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0245s; samplesPerSecond = 10194.5
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.15361620 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0235s; samplesPerSecond = 10636.9
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.17039588 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0246s; samplesPerSecond = 10177.1
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.15516786 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0237s; samplesPerSecond = 10544.1
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.15969617 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0225s; samplesPerSecond = 11102.2
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.15939439 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0234s; samplesPerSecond = 10697.9
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.15300194 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0233s; samplesPerSecond = 10729.2
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.14902476 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0231s; samplesPerSecond = 10811.7
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.15043256 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0231s; samplesPerSecond = 10823.4
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.15531360 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0229s; samplesPerSecond = 10936.1
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.17990796 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0248s; samplesPerSecond = 10088.4
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.22925668 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0229s; samplesPerSecond = 10913.7
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16843626 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0234s; samplesPerSecond = 10682.8
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.18045325 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0236s; samplesPerSecond = 10585.6
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.13337526 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0221s; samplesPerSecond = 11308.6
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.14332977 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0245s; samplesPerSecond = 10219.9
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.18749446 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0242s; samplesPerSecond = 10326.7
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.15505967 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0236s; samplesPerSecond = 10587.8
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.19616616 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0228s; samplesPerSecond = 10980.3
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.17305907 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0236s; samplesPerSecond = 10610.3
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.15197365 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0249s; samplesPerSecond = 10033.3
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.12102416 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0238s; samplesPerSecond = 10483.5
+05/03/2016 15:29:50:  Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15278496 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0235s; samplesPerSecond = 10646.9
+05/03/2016 15:29:50: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.17643784 * 10000; EvalErrorPrediction = 0.07560000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=0.957696s
 05/03/2016 15:29:50: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503162947.903093\CNTKTextFormatReader\Examples\Other\Simple2d_MultiGpu@release_cpu/Models/multigpu.dnn.2'

 05/03/2016 15:29:50: Starting Epoch 3: learning rate per sample = 0.008000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 15:29:50: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1).
-05/03/2016 15:29:50:  Epoch[ 3 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.10623312 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0235s; samplesPerSecond = 10637.4
-05/03/2016 15:29:50:  Epoch[ 3 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.17519442 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0236s; samplesPerSecond = 10608.5
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.14133983 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0240s; samplesPerSecond = 10404.5
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.16278491 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0233s; samplesPerSecond = 10749.0
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.11783558 * 250; EvalClassificationError = 0.04000000 * 250; time = 0.0232s; samplesPerSecond = 10780.0
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.16342188 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0243s; samplesPerSecond = 10305.9
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.16272195 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0239s; samplesPerSecond = 10476.9
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.19401477 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0241s; samplesPerSecond = 10370.0
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.20186661 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0229s; samplesPerSecond = 10903.2
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.13672539 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0235s; samplesPerSecond = 10631.1
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.20069212 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0234s; samplesPerSecond = 10681.5
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.17729039 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0252s; samplesPerSecond = 9928.1
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.15906107 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0251s; samplesPerSecond = 9941.5
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16281632 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0247s; samplesPerSecond = 10121.5
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.19834981 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0248s; samplesPerSecond = 10067.7
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.10217642 * 250; EvalClassificationError = 0.04000000 * 250; time = 0.0247s; samplesPerSecond = 10105.1
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17011383 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0258s; samplesPerSecond = 9692.2
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16599137 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0252s; samplesPerSecond = 9911.6
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.12648996 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0254s; samplesPerSecond = 9848.7
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.11920298 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0248s; samplesPerSecond = 10091.2
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.12883164 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0272s; samplesPerSecond = 9205.1
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.18222479 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0250s; samplesPerSecond = 9988.0
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.13443351 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0246s; samplesPerSecond = 10149.4
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.19720325 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0244s; samplesPerSecond = 10230.8
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.15586137 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0254s; samplesPerSecond = 9860.4
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.11854887 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0250s; samplesPerSecond = 9991.6
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.13705285 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0249s; samplesPerSecond = 10050.7
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.20009941 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0240s; samplesPerSecond = 10411.5
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.19078680 * 250; EvalClassificationError = 0.11200000 * 250; time = 0.0233s; samplesPerSecond = 10741.6
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16505705 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0238s; samplesPerSecond = 10507.7
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.12232722 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0239s; samplesPerSecond = 10472.1
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.16342047 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0238s; samplesPerSecond = 10514.4
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.15875107 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0234s; samplesPerSecond = 10688.3
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.12248772 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0232s; samplesPerSecond = 10793.5
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13457009 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0238s; samplesPerSecond = 10521.4
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.20976565 * 250; EvalClassificationError = 0.11600000 * 250; time = 0.0238s; samplesPerSecond = 10494.9
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16519102 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0230s; samplesPerSecond = 10862.5
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14971420 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0247s; samplesPerSecond = 10106.3
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.16456633 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0230s; samplesPerSecond = 10858.2
-05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.16971407 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0239s; samplesPerSecond = 10473.0
-05/03/2016 15:29:51: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.15787325 * 10000; EvalClassificationError = 0.07430000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=0.972052s
+05/03/2016 15:29:50:  Epoch[ 3 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.10623312 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0235s; samplesPerSecond = 10637.4
+05/03/2016 15:29:50:  Epoch[ 3 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.17519442 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0236s; samplesPerSecond = 10608.5
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.14133983 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0240s; samplesPerSecond = 10404.5
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.16278491 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0233s; samplesPerSecond = 10749.0
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.11783558 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0232s; samplesPerSecond = 10780.0
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.16342188 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0243s; samplesPerSecond = 10305.9
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.16272195 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0239s; samplesPerSecond = 10476.9
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.19401477 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0241s; samplesPerSecond = 10370.0
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.20186661 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0229s; samplesPerSecond = 10903.2
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.13672539 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0235s; samplesPerSecond = 10631.1
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.20069212 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0234s; samplesPerSecond = 10681.5
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.17729039 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0252s; samplesPerSecond = 9928.1
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.15906107 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0251s; samplesPerSecond = 9941.5
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16281632 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0247s; samplesPerSecond = 10121.5
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.19834981 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0248s; samplesPerSecond = 10067.7
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.10217642 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0247s; samplesPerSecond = 10105.1
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17011383 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0258s; samplesPerSecond = 9692.2
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16599137 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0252s; samplesPerSecond = 9911.6
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.12648996 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0254s; samplesPerSecond = 9848.7
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.11920298 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0248s; samplesPerSecond = 10091.2
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.12883164 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0272s; samplesPerSecond = 9205.1
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.18222479 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0250s; samplesPerSecond = 9988.0
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.13443351 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0246s; samplesPerSecond = 10149.4
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.19720325 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0244s; samplesPerSecond = 10230.8
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.15586137 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0254s; samplesPerSecond = 9860.4
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.11854887 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0250s; samplesPerSecond = 9991.6
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.13705285 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0249s; samplesPerSecond = 10050.7
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.20009941 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0240s; samplesPerSecond = 10411.5
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.19078680 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0233s; samplesPerSecond = 10741.6
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16505705 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0238s; samplesPerSecond = 10507.7
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.12232722 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0239s; samplesPerSecond = 10472.1
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.16342047 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0238s; samplesPerSecond = 10514.4
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.15875107 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0234s; samplesPerSecond = 10688.3
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.12248772 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0232s; samplesPerSecond = 10793.5
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13457009 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0238s; samplesPerSecond = 10521.4
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.20976565 * 250; EvalErrorPrediction = 0.11600000 * 250; time = 0.0238s; samplesPerSecond = 10494.9
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16519102 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0230s; samplesPerSecond = 10862.5
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14971420 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0247s; samplesPerSecond = 10106.3
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.16456633 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0230s; samplesPerSecond = 10858.2
+05/03/2016 15:29:51:  Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.16971407 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0239s; samplesPerSecond = 10473.0
+05/03/2016 15:29:51: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.15787325 * 10000; EvalErrorPrediction = 0.07430000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=0.972052s
 05/03/2016 15:29:51: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503162947.903093\CNTKTextFormatReader\Examples\Other\Simple2d_MultiGpu@release_cpu/Models/multigpu.dnn'
 05/03/2016 15:29:51: CNTKCommandTrainEnd: Multigpu_Demo_Train

@ -621,7 +621,7 @@ Post-processing network...

 7 roots:
 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
 	InvStdOfFeatures = InvStdDev()
 	MeanOfFeatures = Mean()
 	PosteriorProb = Softmax()
@ -650,7 +650,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *1] -> [2 x 1 x *1]
 Validating --> B2 = LearnableParameter() :  -> [2 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *1], [2 x 1] -> [2 x 1 x *1]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *1] -> [2 x 1 x *1]
 Validating --> Prior = Mean (labels) : [2 x *1] -> [2]
 Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@ -674,7 +674,7 @@ Allocating matrices for forward and/or backward propagation.

 Memory Sharing Structure:

-0000000000000000: {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalClassificationError Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
+0000000000000000: {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalErrorPrediction Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
 000000CDDFC7B490: {[W0 Value[50 x 2]] }
 000000CDDFC7B530: {[features Value[2 x *1]] }
 000000CDDFC7B710: {[W1 Value[50 x 50]] }
@ -690,7 +690,7 @@ Memory Sharing Structure:
 000000CDDFC8C2B0: {[W1*H1+B1 Value[50 x 1 x *1]] }
 000000CDDFC8C490: {[CrossEntropyWithSoftmax Value[1]] }
 000000CDDFC8C5D0: {[LogOfPrior Value[2]] }
-000000CDDFC8C670: {[EvalClassificationError Value[1]] }
+000000CDDFC8C670: {[EvalErrorPrediction Value[1]] }
 000000CDDFC8C990: {[MVNormalizedFeatures Value[2 x *1]] }
 000000CDDFC8CA30: {[H2 Value[50 x 1 x *1]] }
 000000CDDFC8CC10: {[W1*H1 Value[50 x 1 x *1]] }
@ -699,7 +699,7 @@ Memory Sharing Structure:
 000000CDDFC8D610: {[HLast Value[2 x 1 x *1]] }
 000000CDDFC8D750: {[W0*features+B0 Value[50 x 1 x *1]] }

-05/03/2016 15:29:52: Final Results: Minibatch[1-1]: EvalClassificationError = 0.05306799 * 603; CrossEntropyWithSoftmax = 0.11782631 * 603; perplexity = 1.12504868
+05/03/2016 15:29:52: Final Results: Minibatch[1-1]: EvalErrorPrediction = 0.05306799 * 603; CrossEntropyWithSoftmax = 0.11782631 * 603; perplexity = 1.12504868

 05/03/2016 15:29:52: Action "test" complete.

--- a/Tests/EndToEndTests/Examples/Other/Simple2d/MultiGpu/baseline.windows.gpu.txt
+++ b/Tests/EndToEndTests/Examples/Other/Simple2d/MultiGpu/baseline.windows.gpu.txt
@ -66,7 +66,7 @@ Multigpu_Demo_Train=[
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -167,7 +167,7 @@ Multigpu_Demo_Train=[
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -300,7 +300,7 @@ configparameters: Multigpu.cntk:Multigpu_Demo_Train=[
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -369,7 +369,7 @@ Post-processing network...

 7 roots:
 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
 	InvStdOfFeatures = InvStdDev()
 	MeanOfFeatures = Mean()
 	PosteriorProb = Softmax()
@ -398,7 +398,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *] -> [2 x 1 x *]
 Validating --> B2 = LearnableParameter() :  -> [2 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *], [2 x 1] -> [2 x 1 x *]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *] -> [2 x 1 x *]
 Validating --> Prior = Mean (labels) : [2 x *] -> [2]
 Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@ -422,14 +422,14 @@ Post-processing network complete.

 05/03/2016 15:29:53: Evaluation criterion node(s):

-05/03/2016 15:29:53: 	EvalClassificationError = ClassificationError
+05/03/2016 15:29:53: 	EvalErrorPrediction = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.

 Memory Sharing Structure:

-0000000000000000: {[EvalClassificationError Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
+0000000000000000: {[EvalErrorPrediction Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
 000000572B66ECA0: {[features Value[2 x *]] }
 00000057420A1700: {[W1 Value[50 x 50]] }
 00000057420A1980: {[MeanOfFeatures Value[2]] }
@ -448,7 +448,7 @@ Memory Sharing Structure:
 00000057439283E0: {[LogOfPrior Value[2]] }
 00000057439285C0: {[W0 Gradient[50 x 2]] [W0*features+B0 Value[50 x 1 x *]] }
 0000005743928660: {[B1 Gradient[50 x 1]] [H2 Gradient[50 x 1 x *]] [HLast Gradient[2 x 1 x *]] }
-00000057439287A0: {[EvalClassificationError Value[1]] }
+00000057439287A0: {[EvalErrorPrediction Value[1]] }
 0000005743928980: {[CrossEntropyWithSoftmax Value[1]] }
 0000005743928A20: {[B2 Gradient[2 x 1]] }
 0000005743928E80: {[H1 Value[50 x 1 x *]] [W0*features Gradient[50 x *]] }
@ -472,139 +472,139 @@ Memory Sharing Structure:
 05/03/2016 15:29:54: Starting Epoch 1: learning rate per sample = 0.020000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 15:29:54: Starting minibatch loop.
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[   1-  10]: CrossEntropyWithSoftmax = 0.70650452 * 250; EvalClassificationError = 0.55200000 * 250; time = 0.0115s; samplesPerSecond = 21832.2
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[  11-  20]: CrossEntropyWithSoftmax = 0.69701831 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0095s; samplesPerSecond = 26326.9
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[  21-  30]: CrossEntropyWithSoftmax = 0.71089587 * 250; EvalClassificationError = 0.50400000 * 250; time = 0.0100s; samplesPerSecond = 25067.7
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[  31-  40]: CrossEntropyWithSoftmax = 0.72980273 * 250; EvalClassificationError = 0.56000000 * 250; time = 0.0096s; samplesPerSecond = 26079.7
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[  41-  50]: CrossEntropyWithSoftmax = 0.70902783 * 250; EvalClassificationError = 0.52800000 * 250; time = 0.0115s; samplesPerSecond = 21692.0
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[  51-  60]: CrossEntropyWithSoftmax = 0.72657300 * 250; EvalClassificationError = 0.54400000 * 250; time = 0.0124s; samplesPerSecond = 20127.2
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[  61-  70]: CrossEntropyWithSoftmax = 0.69319678 * 250; EvalClassificationError = 0.43200000 * 250; time = 0.0091s; samplesPerSecond = 27439.4
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[  71-  80]: CrossEntropyWithSoftmax = 0.73563477 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0112s; samplesPerSecond = 22246.0
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[  81-  90]: CrossEntropyWithSoftmax = 0.71463281 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0115s; samplesPerSecond = 21739.1
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[  91- 100]: CrossEntropyWithSoftmax = 0.75213428 * 250; EvalClassificationError = 0.47200000 * 250; time = 0.0105s; samplesPerSecond = 23814.1
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.75931445 * 250; EvalClassificationError = 0.53200000 * 250; time = 0.0115s; samplesPerSecond = 21763.7
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.73075293 * 250; EvalClassificationError = 0.50800000 * 250; time = 0.0120s; samplesPerSecond = 20835.1
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.76701953 * 250; EvalClassificationError = 0.53200000 * 250; time = 0.0130s; samplesPerSecond = 19305.0
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.70451270 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0108s; samplesPerSecond = 23184.6
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.70539941 * 250; EvalClassificationError = 0.50400000 * 250; time = 0.0117s; samplesPerSecond = 21385.8
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.72700293 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0120s; samplesPerSecond = 20917.0
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70096191 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0112s; samplesPerSecond = 22301.5
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.69437305 * 250; EvalClassificationError = 0.49600000 * 250; time = 0.0113s; samplesPerSecond = 22079.0
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.69161621 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0116s; samplesPerSecond = 21514.6
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.73388281 * 250; EvalClassificationError = 0.55200000 * 250; time = 0.0107s; samplesPerSecond = 23406.0
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.72255664 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0116s; samplesPerSecond = 21546.2
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70414551 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0115s; samplesPerSecond = 21756.2
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69976758 * 250; EvalClassificationError = 0.46000000 * 250; time = 0.0113s; samplesPerSecond = 22065.3
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.72419141 * 250; EvalClassificationError = 0.51600000 * 250; time = 0.0114s; samplesPerSecond = 22018.7
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.69943945 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0111s; samplesPerSecond = 22604.0
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.69206445 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0111s; samplesPerSecond = 22504.3
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.68771680 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0113s; samplesPerSecond = 22118.0
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.69878516 * 250; EvalClassificationError = 0.44000000 * 250; time = 0.0130s; samplesPerSecond = 19278.2
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.71889844 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0127s; samplesPerSecond = 19632.5
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.70086523 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0095s; samplesPerSecond = 26329.6
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.70878320 * 250; EvalClassificationError = 0.53200000 * 250; time = 0.0112s; samplesPerSecond = 22361.4
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.70674414 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0130s; samplesPerSecond = 19168.8
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.69707422 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0094s; samplesPerSecond = 26729.4
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.68588281 * 250; EvalClassificationError = 0.40800000 * 250; time = 0.0112s; samplesPerSecond = 22365.4
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.67734766 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0128s; samplesPerSecond = 19583.3
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.67958008 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0092s; samplesPerSecond = 27144.4
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.66424805 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0114s; samplesPerSecond = 21864.6
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.62412500 * 250; EvalClassificationError = 0.20400000 * 250; time = 0.0116s; samplesPerSecond = 21475.8
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.58007422 * 250; EvalClassificationError = 0.16000000 * 250; time = 0.0094s; samplesPerSecond = 26567.5
-05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.52764648 * 250; EvalClassificationError = 0.19200000 * 250; time = 0.0132s; samplesPerSecond = 18988.3
-05/03/2016 15:29:54: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.69975483 * 10000; EvalClassificationError = 0.46850000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=0.453807s
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[   1-  10]: CrossEntropyWithSoftmax = 0.70650452 * 250; EvalErrorPrediction = 0.55200000 * 250; time = 0.0115s; samplesPerSecond = 21832.2
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[  11-  20]: CrossEntropyWithSoftmax = 0.69701831 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0095s; samplesPerSecond = 26326.9
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[  21-  30]: CrossEntropyWithSoftmax = 0.71089587 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.0100s; samplesPerSecond = 25067.7
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[  31-  40]: CrossEntropyWithSoftmax = 0.72980273 * 250; EvalErrorPrediction = 0.56000000 * 250; time = 0.0096s; samplesPerSecond = 26079.7
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[  41-  50]: CrossEntropyWithSoftmax = 0.70902783 * 250; EvalErrorPrediction = 0.52800000 * 250; time = 0.0115s; samplesPerSecond = 21692.0
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[  51-  60]: CrossEntropyWithSoftmax = 0.72657300 * 250; EvalErrorPrediction = 0.54400000 * 250; time = 0.0124s; samplesPerSecond = 20127.2
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[  61-  70]: CrossEntropyWithSoftmax = 0.69319678 * 250; EvalErrorPrediction = 0.43200000 * 250; time = 0.0091s; samplesPerSecond = 27439.4
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[  71-  80]: CrossEntropyWithSoftmax = 0.73563477 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0112s; samplesPerSecond = 22246.0
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[  81-  90]: CrossEntropyWithSoftmax = 0.71463281 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0115s; samplesPerSecond = 21739.1
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[  91- 100]: CrossEntropyWithSoftmax = 0.75213428 * 250; EvalErrorPrediction = 0.47200000 * 250; time = 0.0105s; samplesPerSecond = 23814.1
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.75931445 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0115s; samplesPerSecond = 21763.7
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.73075293 * 250; EvalErrorPrediction = 0.50800000 * 250; time = 0.0120s; samplesPerSecond = 20835.1
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.76701953 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0130s; samplesPerSecond = 19305.0
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.70451270 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0108s; samplesPerSecond = 23184.6
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.70539941 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.0117s; samplesPerSecond = 21385.8
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.72700293 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0120s; samplesPerSecond = 20917.0
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70096191 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0112s; samplesPerSecond = 22301.5
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.69437305 * 250; EvalErrorPrediction = 0.49600000 * 250; time = 0.0113s; samplesPerSecond = 22079.0
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.69161621 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0116s; samplesPerSecond = 21514.6
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.73388281 * 250; EvalErrorPrediction = 0.55200000 * 250; time = 0.0107s; samplesPerSecond = 23406.0
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.72255664 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0116s; samplesPerSecond = 21546.2
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70414551 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0115s; samplesPerSecond = 21756.2
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69976758 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0113s; samplesPerSecond = 22065.3
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.72419141 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0114s; samplesPerSecond = 22018.7
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.69943945 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0111s; samplesPerSecond = 22604.0
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.69206445 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0111s; samplesPerSecond = 22504.3
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.68771680 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0113s; samplesPerSecond = 22118.0
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.69878516 * 250; EvalErrorPrediction = 0.44000000 * 250; time = 0.0130s; samplesPerSecond = 19278.2
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.71889844 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0127s; samplesPerSecond = 19632.5
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.70086523 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0095s; samplesPerSecond = 26329.6
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.70878320 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0112s; samplesPerSecond = 22361.4
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.70674414 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0130s; samplesPerSecond = 19168.8
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.69707422 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0094s; samplesPerSecond = 26729.4
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.68588281 * 250; EvalErrorPrediction = 0.40800000 * 250; time = 0.0112s; samplesPerSecond = 22365.4
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.67734766 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0128s; samplesPerSecond = 19583.3
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.67958008 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0092s; samplesPerSecond = 27144.4
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.66424805 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0114s; samplesPerSecond = 21864.6
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.62412500 * 250; EvalErrorPrediction = 0.20400000 * 250; time = 0.0116s; samplesPerSecond = 21475.8
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.58007422 * 250; EvalErrorPrediction = 0.16000000 * 250; time = 0.0094s; samplesPerSecond = 26567.5
+05/03/2016 15:29:54:  Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.52764648 * 250; EvalErrorPrediction = 0.19200000 * 250; time = 0.0132s; samplesPerSecond = 18988.3
+05/03/2016 15:29:54: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.69975483 * 10000; EvalErrorPrediction = 0.46850000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=0.453807s
 05/03/2016 15:29:54: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503162947.903093\CNTKTextFormatReader\Examples\Other\Simple2d_MultiGpu@release_gpu/Models/multigpu.dnn.1'

 05/03/2016 15:29:54: Starting Epoch 2: learning rate per sample = 0.008000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 15:29:54: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1).
-05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.45075654 * 250; EvalClassificationError = 0.15200000 * 250; time = 0.0250s; samplesPerSecond = 10002.4
-05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.40775497 * 250; EvalClassificationError = 0.14400000 * 250; time = 0.0219s; samplesPerSecond = 11420.2
-05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.34165228 * 250; EvalClassificationError = 0.11200000 * 250; time = 0.0230s; samplesPerSecond = 10859.6
-05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.29708900 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0198s; samplesPerSecond = 12604.0
-05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.26669365 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0211s; samplesPerSecond = 11860.7
-05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.25328680 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0212s; samplesPerSecond = 11817.0
-05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.21017820 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0237s; samplesPerSecond = 10540.1
-05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.21483054 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0214s; samplesPerSecond = 11699.7
-05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.16626513 * 250; EvalClassificationError = 0.04000000 * 250; time = 0.0213s; samplesPerSecond = 11757.5
-05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.17672434 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0239s; samplesPerSecond = 10454.6
-05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.22140190 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0208s; samplesPerSecond = 12033.1
-05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.17048554 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0237s; samplesPerSecond = 10553.4
-05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.16438517 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0234s; samplesPerSecond = 10662.3
-05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13782141 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0218s; samplesPerSecond = 11449.0
-05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.16909663 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0244s; samplesPerSecond = 10228.7
-05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.15419129 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0229s; samplesPerSecond = 10924.7
-05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.22229924 * 250; EvalClassificationError = 0.11200000 * 250; time = 0.0242s; samplesPerSecond = 10340.4
-05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18134995 * 250; EvalClassificationError = 0.09600000 * 250; time = 0.0236s; samplesPerSecond = 10579.3
-05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.15616904 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0236s; samplesPerSecond = 10594.6
-05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.17162733 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0262s; samplesPerSecond = 9530.3
-05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.15676289 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0262s; samplesPerSecond = 9554.4
-05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.16159542 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0262s; samplesPerSecond = 9558.8
-05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.16102246 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0284s; samplesPerSecond = 8800.3
-05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.15392923 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0248s; samplesPerSecond = 10089.6
-05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.14898334 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0269s; samplesPerSecond = 9279.5
-05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.15087969 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0285s; samplesPerSecond = 8785.2
-05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.15494578 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0247s; samplesPerSecond = 10101.4
-05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.17878713 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0250s; samplesPerSecond = 9986.0
-05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.22845049 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0249s; samplesPerSecond = 10045.4
-05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16884430 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0241s; samplesPerSecond = 10376.5
-05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17970282 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0237s; samplesPerSecond = 10533.9
-05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.13292468 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0257s; samplesPerSecond = 9721.6
-05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.14167778 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0226s; samplesPerSecond = 11048.3
-05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.18716852 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0237s; samplesPerSecond = 10534.7
-05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.15480385 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0258s; samplesPerSecond = 9705.0
-05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.19482328 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0247s; samplesPerSecond = 10115.7
-05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.17488171 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0249s; samplesPerSecond = 10048.2
-05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.15164433 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0281s; samplesPerSecond = 8901.2
-05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.12142463 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0222s; samplesPerSecond = 11279.0
-05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15287631 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0238s; samplesPerSecond = 10489.7
-05/03/2016 15:29:55: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.19475469 * 10000; EvalClassificationError = 0.07830000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=0.964496s
+05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.45075654 * 250; EvalErrorPrediction = 0.15200000 * 250; time = 0.0250s; samplesPerSecond = 10002.4
+05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.40775497 * 250; EvalErrorPrediction = 0.14400000 * 250; time = 0.0219s; samplesPerSecond = 11420.2
+05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.34165228 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0230s; samplesPerSecond = 10859.6
+05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.29708900 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0198s; samplesPerSecond = 12604.0
+05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.26669365 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0211s; samplesPerSecond = 11860.7
+05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.25328680 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0212s; samplesPerSecond = 11817.0
+05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.21017820 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0237s; samplesPerSecond = 10540.1
+05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.21483054 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0214s; samplesPerSecond = 11699.7
+05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.16626513 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0213s; samplesPerSecond = 11757.5
+05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.17672434 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0239s; samplesPerSecond = 10454.6
+05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.22140190 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0208s; samplesPerSecond = 12033.1
+05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.17048554 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0237s; samplesPerSecond = 10553.4
+05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.16438517 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0234s; samplesPerSecond = 10662.3
+05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13782141 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0218s; samplesPerSecond = 11449.0
+05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.16909663 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0244s; samplesPerSecond = 10228.7
+05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.15419129 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0229s; samplesPerSecond = 10924.7
+05/03/2016 15:29:54:  Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.22229924 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0242s; samplesPerSecond = 10340.4
+05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18134995 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0236s; samplesPerSecond = 10579.3
+05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.15616904 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0236s; samplesPerSecond = 10594.6
+05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.17162733 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0262s; samplesPerSecond = 9530.3
+05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.15676289 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0262s; samplesPerSecond = 9554.4
+05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.16159542 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0262s; samplesPerSecond = 9558.8
+05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.16102246 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0284s; samplesPerSecond = 8800.3
+05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.15392923 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0248s; samplesPerSecond = 10089.6
+05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.14898334 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0269s; samplesPerSecond = 9279.5
+05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.15087969 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0285s; samplesPerSecond = 8785.2
+05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.15494578 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0247s; samplesPerSecond = 10101.4
+05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.17878713 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0250s; samplesPerSecond = 9986.0
+05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.22845049 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0249s; samplesPerSecond = 10045.4
+05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16884430 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0241s; samplesPerSecond = 10376.5
+05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17970282 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0237s; samplesPerSecond = 10533.9
+05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.13292468 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0257s; samplesPerSecond = 9721.6
+05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.14167778 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0226s; samplesPerSecond = 11048.3
+05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.18716852 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0237s; samplesPerSecond = 10534.7
+05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.15480385 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0258s; samplesPerSecond = 9705.0
+05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.19482328 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0247s; samplesPerSecond = 10115.7
+05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.17488171 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0249s; samplesPerSecond = 10048.2
+05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.15164433 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0281s; samplesPerSecond = 8901.2
+05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.12142463 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0222s; samplesPerSecond = 11279.0
+05/03/2016 15:29:55:  Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15287631 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0238s; samplesPerSecond = 10489.7
+05/03/2016 15:29:55: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.19475469 * 10000; EvalErrorPrediction = 0.07830000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=0.964496s
 05/03/2016 15:29:55: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503162947.903093\CNTKTextFormatReader\Examples\Other\Simple2d_MultiGpu@release_gpu/Models/multigpu.dnn.2'

 05/03/2016 15:29:55: Starting Epoch 3: learning rate per sample = 0.008000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 15:29:55: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1).
-05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.10717578 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0253s; samplesPerSecond = 9869.7
-05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.17521929 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0234s; samplesPerSecond = 10701.1
-05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.14088211 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0250s; samplesPerSecond = 9986.8
-05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.16281337 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0243s; samplesPerSecond = 10287.6
-05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.11778386 * 250; EvalClassificationError = 0.04000000 * 250; time = 0.0234s; samplesPerSecond = 10666.9
-05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.16295400 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0266s; samplesPerSecond = 9385.8
-05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.16287201 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0233s; samplesPerSecond = 10746.2
-05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.19482140 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0242s; samplesPerSecond = 10312.3
-05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.20113689 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0235s; samplesPerSecond = 10643.3
-05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.13748570 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0238s; samplesPerSecond = 10484.4
-05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.20080420 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0236s; samplesPerSecond = 10600.9
-05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.17730590 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0268s; samplesPerSecond = 9342.3
-05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.15851029 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0233s; samplesPerSecond = 10743.0
-05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16257260 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0250s; samplesPerSecond = 10012.8
-05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.19772537 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0224s; samplesPerSecond = 11143.3
-05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.10259204 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0235s; samplesPerSecond = 10626.1
-05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17093073 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0244s; samplesPerSecond = 10230.0
-05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16628544 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0252s; samplesPerSecond = 9936.8
-05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.12690716 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0246s; samplesPerSecond = 10171.7
-05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.11894288 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0233s; samplesPerSecond = 10718.1
-05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.12815907 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0246s; samplesPerSecond = 10151.0
-05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.18265773 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0225s; samplesPerSecond = 11131.9
-05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.13388730 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0231s; samplesPerSecond = 10807.5
-05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.19787903 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0251s; samplesPerSecond = 9951.4
-05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.15563315 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0241s; samplesPerSecond = 10373.0
-05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.11837055 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0240s; samplesPerSecond = 10429.3
-05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.13732942 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0234s; samplesPerSecond = 10689.7
-05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.20012115 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0253s; samplesPerSecond = 9872.4
-05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.19086846 * 250; EvalClassificationError = 0.11200000 * 250; time = 0.0238s; samplesPerSecond = 10525.4
-05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16492589 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0243s; samplesPerSecond = 10272.8
-05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.12141157 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0238s; samplesPerSecond = 10509.5
-05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.16335481 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0236s; samplesPerSecond = 10579.3
-05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.15923900 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0241s; samplesPerSecond = 10358.0
-05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.12315803 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0235s; samplesPerSecond = 10617.1
-05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13481532 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0260s; samplesPerSecond = 9612.4
-05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.20958008 * 250; EvalClassificationError = 0.11600000 * 250; time = 0.0223s; samplesPerSecond = 11232.4
-05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16519713 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0255s; samplesPerSecond = 9814.3
-05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14990733 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0239s; samplesPerSecond = 10481.3
-05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.16508552 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0255s; samplesPerSecond = 9789.3
-05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.16941540 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0240s; samplesPerSecond = 10435.4
-05/03/2016 15:29:56: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.15791792 * 10000; EvalClassificationError = 0.07460000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=0.970059s
+05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.10717578 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0253s; samplesPerSecond = 9869.7
+05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.17521929 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0234s; samplesPerSecond = 10701.1
+05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.14088211 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0250s; samplesPerSecond = 9986.8
+05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.16281337 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0243s; samplesPerSecond = 10287.6
+05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.11778386 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0234s; samplesPerSecond = 10666.9
+05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.16295400 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0266s; samplesPerSecond = 9385.8
+05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.16287201 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0233s; samplesPerSecond = 10746.2
+05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.19482140 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0242s; samplesPerSecond = 10312.3
+05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.20113689 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0235s; samplesPerSecond = 10643.3
+05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.13748570 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0238s; samplesPerSecond = 10484.4
+05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.20080420 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0236s; samplesPerSecond = 10600.9
+05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.17730590 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0268s; samplesPerSecond = 9342.3
+05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.15851029 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0233s; samplesPerSecond = 10743.0
+05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16257260 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0250s; samplesPerSecond = 10012.8
+05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.19772537 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0224s; samplesPerSecond = 11143.3
+05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.10259204 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0235s; samplesPerSecond = 10626.1
+05/03/2016 15:29:55:  Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17093073 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0244s; samplesPerSecond = 10230.0
+05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16628544 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0252s; samplesPerSecond = 9936.8
+05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.12690716 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0246s; samplesPerSecond = 10171.7
+05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.11894288 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0233s; samplesPerSecond = 10718.1
+05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.12815907 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0246s; samplesPerSecond = 10151.0
+05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.18265773 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0225s; samplesPerSecond = 11131.9
+05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.13388730 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0231s; samplesPerSecond = 10807.5
+05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.19787903 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0251s; samplesPerSecond = 9951.4
+05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.15563315 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0241s; samplesPerSecond = 10373.0
+05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.11837055 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0240s; samplesPerSecond = 10429.3
+05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.13732942 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0234s; samplesPerSecond = 10689.7
+05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.20012115 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0253s; samplesPerSecond = 9872.4
+05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.19086846 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0238s; samplesPerSecond = 10525.4
+05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16492589 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0243s; samplesPerSecond = 10272.8
+05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.12141157 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0238s; samplesPerSecond = 10509.5
+05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.16335481 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0236s; samplesPerSecond = 10579.3
+05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.15923900 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0241s; samplesPerSecond = 10358.0
+05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.12315803 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0235s; samplesPerSecond = 10617.1
+05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13481532 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0260s; samplesPerSecond = 9612.4
+05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.20958008 * 250; EvalErrorPrediction = 0.11600000 * 250; time = 0.0223s; samplesPerSecond = 11232.4
+05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16519713 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0255s; samplesPerSecond = 9814.3
+05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14990733 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0239s; samplesPerSecond = 10481.3
+05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.16508552 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0255s; samplesPerSecond = 9789.3
+05/03/2016 15:29:56:  Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.16941540 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0240s; samplesPerSecond = 10435.4
+05/03/2016 15:29:56: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.15791792 * 10000; EvalErrorPrediction = 0.07460000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=0.970059s
 05/03/2016 15:29:56: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503162947.903093\CNTKTextFormatReader\Examples\Other\Simple2d_MultiGpu@release_gpu/Models/multigpu.dnn'
 05/03/2016 15:29:56: CNTKCommandTrainEnd: Multigpu_Demo_Train

@ -622,7 +622,7 @@ Post-processing network...

 7 roots:
 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
 	InvStdOfFeatures = InvStdDev()
 	MeanOfFeatures = Mean()
 	PosteriorProb = Softmax()
@ -651,7 +651,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *1] -> [2 x 1 x *1]
 Validating --> B2 = LearnableParameter() :  -> [2 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *1], [2 x 1] -> [2 x 1 x *1]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *1] -> [2 x 1 x *1]
 Validating --> Prior = Mean (labels) : [2 x *1] -> [2]
 Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@ -675,7 +675,7 @@ Allocating matrices for forward and/or backward propagation.

 Memory Sharing Structure:

-0000000000000000: {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalClassificationError Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
+0000000000000000: {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalErrorPrediction Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
 0000005743925BB0: {[HLast Value[2 x 1 x *1]] }
 0000005743925D90: {[MVNormalizedFeatures Value[2 x *1]] }
 0000005743925E30: {[CrossEntropyWithSoftmax Value[1]] }
@ -688,7 +688,7 @@ Memory Sharing Structure:
 00000057439265B0: {[W0*features+B0 Value[50 x 1 x *1]] }
 0000005743926650: {[W1*H1 Value[50 x 1 x *1]] }
 0000005743926970: {[H2 Value[50 x 1 x *1]] }
-0000005743926AB0: {[EvalClassificationError Value[1]] }
+0000005743926AB0: {[EvalErrorPrediction Value[1]] }
 000000574B7FAD10: {[W0 Value[50 x 2]] }
 000000574B7FB170: {[InvStdOfFeatures Value[2]] }
 000000574B7FB210: {[MeanOfFeatures Value[2]] }
@ -700,7 +700,7 @@ Memory Sharing Structure:
 000000574D960E50: {[B2 Value[2 x 1]] }
 000000574D9610D0: {[B0 Value[50 x 1]] }

-05/03/2016 15:29:56: Final Results: Minibatch[1-1]: EvalClassificationError = 0.05638474 * 603; CrossEntropyWithSoftmax = 0.12022919 * 603; perplexity = 1.12775529
+05/03/2016 15:29:56: Final Results: Minibatch[1-1]: EvalErrorPrediction = 0.05638474 * 603; CrossEntropyWithSoftmax = 0.12022919 * 603; perplexity = 1.12775529

 05/03/2016 15:29:56: Action "test" complete.

--- a/Tests/EndToEndTests/Examples/Other/Simple2d/Simple/baseline.linux.cpu.txt
+++ b/Tests/EndToEndTests/Examples/Other/Simple2d/Simple/baseline.linux.cpu.txt
@ -58,7 +58,7 @@ Simple_Demo_Train = [
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -157,7 +157,7 @@ Simple_Demo_Train = [
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -300,7 +300,7 @@ configparameters: Simple.cntk:Simple_Demo_Train=[
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -355,7 +355,7 @@ Post-processing network...

 7 roots:
 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
 	InvStdOfFeatures = InvStdDev()
 	MeanOfFeatures = Mean()
 	PosteriorProb = Softmax()
@ -384,7 +384,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *] -> [2 x 1 x *]
 Validating --> B2 = LearnableParameter() :  -> [2 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *], [2 x 1] -> [2 x 1 x *]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *] -> [2 x 1 x *]
 Validating --> Prior = Mean (labels) : [2 x *] -> [2]
 Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@ -408,14 +408,14 @@ Post-processing network complete.

 05/03/2016 15:21:15: Evaluation criterion node(s):

-05/03/2016 15:21:15: 	EvalClassificationError = ClassificationError
+05/03/2016 15:21:15: 	EvalErrorPrediction = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.

 Memory Sharing Structure:

-(nil): {[EvalClassificationError Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
+(nil): {[EvalErrorPrediction Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
 0x2e7f338: {[features Value[2 x *]] }
 0x2e82908: {[MeanOfFeatures Value[2]] }
 0x2e84f08: {[InvStdOfFeatures Value[2]] }
@ -427,7 +427,7 @@ Memory Sharing Structure:
 0x2e8b718: {[B2 Value[2 x 1]] }
 0x2e8c1e8: {[labels Value[2 x *]] }
 0x2e8cf38: {[Prior Value[2]] }
-0x2e926f8: {[EvalClassificationError Value[1]] }
+0x2e926f8: {[EvalErrorPrediction Value[1]] }
 0x2e92858: {[ScaledLogLikelihood Value[2 x 1 x *]] }
 0x2e929b8: {[CrossEntropyWithSoftmax Value[1]] }
 0x2e93218: {[LogOfPrior Value[2]] }
@ -458,139 +458,139 @@ Memory Sharing Structure:
 05/03/2016 15:21:17: Starting Epoch 1: learning rate per sample = 0.020000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 15:21:17: Starting minibatch loop.
-05/03/2016 15:21:17:  Epoch[ 1 of 3]-Minibatch[   1-  10]: CrossEntropyWithSoftmax = 0.69966235 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0806s; samplesPerSecond = 3103.4
-05/03/2016 15:21:17:  Epoch[ 1 of 3]-Minibatch[  11-  20]: CrossEntropyWithSoftmax = 0.70639648 * 250; EvalClassificationError = 0.49600000 * 250; time = 0.0489s; samplesPerSecond = 5107.5
-05/03/2016 15:21:17:  Epoch[ 1 of 3]-Minibatch[  21-  30]: CrossEntropyWithSoftmax = 0.70470264 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0598s; samplesPerSecond = 4180.0
-05/03/2016 15:21:17:  Epoch[ 1 of 3]-Minibatch[  31-  40]: CrossEntropyWithSoftmax = 0.69813501 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0581s; samplesPerSecond = 4306.3
-05/03/2016 15:21:17:  Epoch[ 1 of 3]-Minibatch[  41-  50]: CrossEntropyWithSoftmax = 0.73551416 * 250; EvalClassificationError = 0.57600000 * 250; time = 0.0618s; samplesPerSecond = 4045.4
-05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[  51-  60]: CrossEntropyWithSoftmax = 0.72432324 * 250; EvalClassificationError = 0.50800000 * 250; time = 0.0579s; samplesPerSecond = 4314.7
-05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[  61-  70]: CrossEntropyWithSoftmax = 0.73327588 * 250; EvalClassificationError = 0.48800000 * 250; time = 0.2699s; samplesPerSecond = 926.3
-05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[  71-  80]: CrossEntropyWithSoftmax = 0.70092627 * 250; EvalClassificationError = 0.50400000 * 250; time = 0.0620s; samplesPerSecond = 4035.0
-05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[  81-  90]: CrossEntropyWithSoftmax = 0.72354980 * 250; EvalClassificationError = 0.46000000 * 250; time = 0.0826s; samplesPerSecond = 3027.2
-05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[  91- 100]: CrossEntropyWithSoftmax = 0.72148096 * 250; EvalClassificationError = 0.52000000 * 250; time = 0.0811s; samplesPerSecond = 3082.2
-05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.69814941 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0895s; samplesPerSecond = 2793.1
-05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.70699121 * 250; EvalClassificationError = 0.54800000 * 250; time = 0.0482s; samplesPerSecond = 5187.9
-05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.69898437 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0567s; samplesPerSecond = 4408.3
-05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.71712695 * 250; EvalClassificationError = 0.54000000 * 250; time = 0.0586s; samplesPerSecond = 4266.7
-05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.69470703 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0546s; samplesPerSecond = 4575.3
-05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.71375879 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0640s; samplesPerSecond = 3907.4
-05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70381641 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0756s; samplesPerSecond = 3307.9
-05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.71748633 * 250; EvalClassificationError = 0.48800000 * 250; time = 0.0598s; samplesPerSecond = 4178.1
-05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.71863281 * 250; EvalClassificationError = 0.50400000 * 250; time = 0.0813s; samplesPerSecond = 3075.3
-05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.70715234 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0811s; samplesPerSecond = 3082.9
-05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.70401074 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0673s; samplesPerSecond = 3717.1
-05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70599414 * 250; EvalClassificationError = 0.48400000 * 250; time = 0.0819s; samplesPerSecond = 3052.5
-05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69628711 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0909s; samplesPerSecond = 2749.3
-05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.75920898 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0752s; samplesPerSecond = 3323.1
-05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.70542578 * 250; EvalClassificationError = 0.43600000 * 250; time = 0.0734s; samplesPerSecond = 3406.2
-05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.70643945 * 250; EvalClassificationError = 0.46400000 * 250; time = 0.0869s; samplesPerSecond = 2875.4
-05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.72481641 * 250; EvalClassificationError = 0.51600000 * 250; time = 0.0893s; samplesPerSecond = 2798.7
-05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.71133594 * 250; EvalClassificationError = 0.55600000 * 250; time = 0.0814s; samplesPerSecond = 3072.2
-05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.68605664 * 250; EvalClassificationError = 0.47200000 * 250; time = 0.0812s; samplesPerSecond = 3077.4
-05/03/2016 15:21:20:  Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.69535352 * 250; EvalClassificationError = 0.47200000 * 250; time = 0.0895s; samplesPerSecond = 2792.1
-05/03/2016 15:21:20:  Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.68741797 * 250; EvalClassificationError = 0.45200000 * 250; time = 0.0831s; samplesPerSecond = 3008.7
-05/03/2016 15:21:20:  Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.67916406 * 250; EvalClassificationError = 0.46000000 * 250; time = 0.0818s; samplesPerSecond = 3056.5
-05/03/2016 15:21:20:  Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.67841992 * 250; EvalClassificationError = 0.44800000 * 250; time = 0.2681s; samplesPerSecond = 932.5
-05/03/2016 15:21:20:  Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.68038477 * 250; EvalClassificationError = 0.49200000 * 250; time = 0.0513s; samplesPerSecond = 4869.4
-05/03/2016 15:21:20:  Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.61937109 * 250; EvalClassificationError = 0.30400000 * 250; time = 0.0680s; samplesPerSecond = 3678.3
-05/03/2016 15:21:20:  Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.57844141 * 250; EvalClassificationError = 0.27200000 * 250; time = 0.0758s; samplesPerSecond = 3296.3
-05/03/2016 15:21:20:  Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.49124023 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0664s; samplesPerSecond = 3763.4
-05/03/2016 15:21:20:  Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.39071289 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0505s; samplesPerSecond = 4955.3
-05/03/2016 15:21:20:  Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.27650586 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0515s; samplesPerSecond = 4855.7
-05/03/2016 15:21:20:  Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.26430078 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0517s; samplesPerSecond = 4834.4
-05/03/2016 15:21:20: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.66664150 * 10000; EvalClassificationError = 0.44430000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=3.21314s
+05/03/2016 15:21:17:  Epoch[ 1 of 3]-Minibatch[   1-  10]: CrossEntropyWithSoftmax = 0.69966235 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0806s; samplesPerSecond = 3103.4
+05/03/2016 15:21:17:  Epoch[ 1 of 3]-Minibatch[  11-  20]: CrossEntropyWithSoftmax = 0.70639648 * 250; EvalErrorPrediction = 0.49600000 * 250; time = 0.0489s; samplesPerSecond = 5107.5
+05/03/2016 15:21:17:  Epoch[ 1 of 3]-Minibatch[  21-  30]: CrossEntropyWithSoftmax = 0.70470264 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0598s; samplesPerSecond = 4180.0
+05/03/2016 15:21:17:  Epoch[ 1 of 3]-Minibatch[  31-  40]: CrossEntropyWithSoftmax = 0.69813501 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0581s; samplesPerSecond = 4306.3
+05/03/2016 15:21:17:  Epoch[ 1 of 3]-Minibatch[  41-  50]: CrossEntropyWithSoftmax = 0.73551416 * 250; EvalErrorPrediction = 0.57600000 * 250; time = 0.0618s; samplesPerSecond = 4045.4
+05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[  51-  60]: CrossEntropyWithSoftmax = 0.72432324 * 250; EvalErrorPrediction = 0.50800000 * 250; time = 0.0579s; samplesPerSecond = 4314.7
+05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[  61-  70]: CrossEntropyWithSoftmax = 0.73327588 * 250; EvalErrorPrediction = 0.48800000 * 250; time = 0.2699s; samplesPerSecond = 926.3
+05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[  71-  80]: CrossEntropyWithSoftmax = 0.70092627 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.0620s; samplesPerSecond = 4035.0
+05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[  81-  90]: CrossEntropyWithSoftmax = 0.72354980 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0826s; samplesPerSecond = 3027.2
+05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[  91- 100]: CrossEntropyWithSoftmax = 0.72148096 * 250; EvalErrorPrediction = 0.52000000 * 250; time = 0.0811s; samplesPerSecond = 3082.2
+05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.69814941 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0895s; samplesPerSecond = 2793.1
+05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.70699121 * 250; EvalErrorPrediction = 0.54800000 * 250; time = 0.0482s; samplesPerSecond = 5187.9
+05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.69898437 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0567s; samplesPerSecond = 4408.3
+05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.71712695 * 250; EvalErrorPrediction = 0.54000000 * 250; time = 0.0586s; samplesPerSecond = 4266.7
+05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.69470703 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0546s; samplesPerSecond = 4575.3
+05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.71375879 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0640s; samplesPerSecond = 3907.4
+05/03/2016 15:21:18:  Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70381641 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0756s; samplesPerSecond = 3307.9
+05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.71748633 * 250; EvalErrorPrediction = 0.48800000 * 250; time = 0.0598s; samplesPerSecond = 4178.1
+05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.71863281 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.0813s; samplesPerSecond = 3075.3
+05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.70715234 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0811s; samplesPerSecond = 3082.9
+05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.70401074 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0673s; samplesPerSecond = 3717.1
+05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70599414 * 250; EvalErrorPrediction = 0.48400000 * 250; time = 0.0819s; samplesPerSecond = 3052.5
+05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69628711 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0909s; samplesPerSecond = 2749.3
+05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.75920898 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0752s; samplesPerSecond = 3323.1
+05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.70542578 * 250; EvalErrorPrediction = 0.43600000 * 250; time = 0.0734s; samplesPerSecond = 3406.2
+05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.70643945 * 250; EvalErrorPrediction = 0.46400000 * 250; time = 0.0869s; samplesPerSecond = 2875.4
+05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.72481641 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0893s; samplesPerSecond = 2798.7
+05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.71133594 * 250; EvalErrorPrediction = 0.55600000 * 250; time = 0.0814s; samplesPerSecond = 3072.2
+05/03/2016 15:21:19:  Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.68605664 * 250; EvalErrorPrediction = 0.47200000 * 250; time = 0.0812s; samplesPerSecond = 3077.4
+05/03/2016 15:21:20:  Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.69535352 * 250; EvalErrorPrediction = 0.47200000 * 250; time = 0.0895s; samplesPerSecond = 2792.1
+05/03/2016 15:21:20:  Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.68741797 * 250; EvalErrorPrediction = 0.45200000 * 250; time = 0.0831s; samplesPerSecond = 3008.7
+05/03/2016 15:21:20:  Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.67916406 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0818s; samplesPerSecond = 3056.5
+05/03/2016 15:21:20:  Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.67841992 * 250; EvalErrorPrediction = 0.44800000 * 250; time = 0.2681s; samplesPerSecond = 932.5
+05/03/2016 15:21:20:  Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.68038477 * 250; EvalErrorPrediction = 0.49200000 * 250; time = 0.0513s; samplesPerSecond = 4869.4
+05/03/2016 15:21:20:  Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.61937109 * 250; EvalErrorPrediction = 0.30400000 * 250; time = 0.0680s; samplesPerSecond = 3678.3
+05/03/2016 15:21:20:  Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.57844141 * 250; EvalErrorPrediction = 0.27200000 * 250; time = 0.0758s; samplesPerSecond = 3296.3
+05/03/2016 15:21:20:  Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.49124023 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0664s; samplesPerSecond = 3763.4
+05/03/2016 15:21:20:  Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.39071289 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0505s; samplesPerSecond = 4955.3
+05/03/2016 15:21:20:  Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.27650586 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0515s; samplesPerSecond = 4855.7
+05/03/2016 15:21:20:  Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.26430078 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0517s; samplesPerSecond = 4834.4
+05/03/2016 15:21:20: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.66664150 * 10000; EvalErrorPrediction = 0.44430000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=3.21314s
 05/03/2016 15:21:20: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152115.267374/CNTKTextFormatReader/Examples/Other/Simple2d_Simple@release_cpu/Models/simple.dnn.1'

 05/03/2016 15:21:20: Starting Epoch 2: learning rate per sample = 0.008000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 15:21:20: Starting minibatch loop.
-05/03/2016 15:21:20:  Epoch[ 2 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.20732678 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0782s; samplesPerSecond = 3196.0
-05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.19684015 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0812s; samplesPerSecond = 3079.4
-05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.16083588 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0796s; samplesPerSecond = 3141.3
-05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.13558752 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0811s; samplesPerSecond = 3083.5
-05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.17992950 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0814s; samplesPerSecond = 3070.9
-05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.17858063 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0812s; samplesPerSecond = 3079.3
-05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.16847546 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0688s; samplesPerSecond = 3631.6
-05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.16359399 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0547s; samplesPerSecond = 4572.7
-05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.19534705 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0521s; samplesPerSecond = 4796.2
-05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.19363660 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0758s; samplesPerSecond = 3297.5
-05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.12703638 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0682s; samplesPerSecond = 3667.7
-05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.18622827 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0576s; samplesPerSecond = 4344.0
-05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.11595044 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0599s; samplesPerSecond = 4171.2
-05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16689380 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0650s; samplesPerSecond = 3845.2
-05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.15822559 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0631s; samplesPerSecond = 3964.2
-05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.18381909 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0638s; samplesPerSecond = 3920.5
-05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.18274048 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0642s; samplesPerSecond = 3893.2
-05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18638428 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0564s; samplesPerSecond = 4431.5
-05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.20111572 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0528s; samplesPerSecond = 4733.8
-05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.13185034 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0504s; samplesPerSecond = 4962.1
-05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13692554 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0559s; samplesPerSecond = 4468.8
-05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.15396802 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0672s; samplesPerSecond = 3719.4
-05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.15347241 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0818s; samplesPerSecond = 3057.6
-05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.14583887 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.2662s; samplesPerSecond = 939.1
-05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.12333276 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0738s; samplesPerSecond = 3389.0
-05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.13958154 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0778s; samplesPerSecond = 3211.3
-05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12539844 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0772s; samplesPerSecond = 3239.1
-05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.19014404 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0475s; samplesPerSecond = 5259.1
-05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.17959521 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0780s; samplesPerSecond = 3206.4
-05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.18899121 * 250; EvalClassificationError = 0.09600000 * 250; time = 0.0469s; samplesPerSecond = 5333.6
-05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17525586 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0625s; samplesPerSecond = 4003.1
-05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.14735645 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0940s; samplesPerSecond = 2658.9
-05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.13705518 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0543s; samplesPerSecond = 4600.2
-05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.13610693 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0752s; samplesPerSecond = 3324.2
-05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13555811 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0583s; samplesPerSecond = 4291.1
-05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.14883594 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0598s; samplesPerSecond = 4180.7
-05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.14724707 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0599s; samplesPerSecond = 4172.4
-05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.13130469 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0664s; samplesPerSecond = 3764.2
-05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.19636084 * 250; EvalClassificationError = 0.11600000 * 250; time = 0.0644s; samplesPerSecond = 3884.1
-05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15681836 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0651s; samplesPerSecond = 3841.0
-05/03/2016 15:21:23: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.16173864 * 10000; EvalClassificationError = 0.07520000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=2.87283s
+05/03/2016 15:21:20:  Epoch[ 2 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.20732678 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0782s; samplesPerSecond = 3196.0
+05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.19684015 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0812s; samplesPerSecond = 3079.4
+05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.16083588 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0796s; samplesPerSecond = 3141.3
+05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.13558752 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0811s; samplesPerSecond = 3083.5
+05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.17992950 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0814s; samplesPerSecond = 3070.9
+05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.17858063 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0812s; samplesPerSecond = 3079.3
+05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.16847546 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0688s; samplesPerSecond = 3631.6
+05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.16359399 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0547s; samplesPerSecond = 4572.7
+05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.19534705 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0521s; samplesPerSecond = 4796.2
+05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.19363660 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0758s; samplesPerSecond = 3297.5
+05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.12703638 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0682s; samplesPerSecond = 3667.7
+05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.18622827 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0576s; samplesPerSecond = 4344.0
+05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.11595044 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0599s; samplesPerSecond = 4171.2
+05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16689380 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0650s; samplesPerSecond = 3845.2
+05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.15822559 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0631s; samplesPerSecond = 3964.2
+05/03/2016 15:21:21:  Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.18381909 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0638s; samplesPerSecond = 3920.5
+05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.18274048 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0642s; samplesPerSecond = 3893.2
+05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18638428 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0564s; samplesPerSecond = 4431.5
+05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.20111572 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0528s; samplesPerSecond = 4733.8
+05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.13185034 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0504s; samplesPerSecond = 4962.1
+05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13692554 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0559s; samplesPerSecond = 4468.8
+05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.15396802 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0672s; samplesPerSecond = 3719.4
+05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.15347241 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0818s; samplesPerSecond = 3057.6
+05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.14583887 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.2662s; samplesPerSecond = 939.1
+05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.12333276 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0738s; samplesPerSecond = 3389.0
+05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.13958154 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0778s; samplesPerSecond = 3211.3
+05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12539844 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0772s; samplesPerSecond = 3239.1
+05/03/2016 15:21:22:  Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.19014404 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0475s; samplesPerSecond = 5259.1
+05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.17959521 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0780s; samplesPerSecond = 3206.4
+05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.18899121 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0469s; samplesPerSecond = 5333.6
+05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17525586 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0625s; samplesPerSecond = 4003.1
+05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.14735645 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0940s; samplesPerSecond = 2658.9
+05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.13705518 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0543s; samplesPerSecond = 4600.2
+05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.13610693 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0752s; samplesPerSecond = 3324.2
+05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13555811 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0583s; samplesPerSecond = 4291.1
+05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.14883594 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0598s; samplesPerSecond = 4180.7
+05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.14724707 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0599s; samplesPerSecond = 4172.4
+05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.13130469 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0664s; samplesPerSecond = 3764.2
+05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.19636084 * 250; EvalErrorPrediction = 0.11600000 * 250; time = 0.0644s; samplesPerSecond = 3884.1
+05/03/2016 15:21:23:  Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15681836 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0651s; samplesPerSecond = 3841.0
+05/03/2016 15:21:23: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.16173864 * 10000; EvalErrorPrediction = 0.07520000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=2.87283s
 05/03/2016 15:21:23: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152115.267374/CNTKTextFormatReader/Examples/Other/Simple2d_Simple@release_cpu/Models/simple.dnn.2'

 05/03/2016 15:21:23: Starting Epoch 3: learning rate per sample = 0.008000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 15:21:23: Starting minibatch loop.
-05/03/2016 15:21:23:  Epoch[ 3 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.18214960 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0604s; samplesPerSecond = 4138.7
-05/03/2016 15:21:23:  Epoch[ 3 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.13526825 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0622s; samplesPerSecond = 4020.6
-05/03/2016 15:21:23:  Epoch[ 3 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.14344995 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0640s; samplesPerSecond = 3906.0
-05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.12557471 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0628s; samplesPerSecond = 3978.7
-05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.17627924 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0639s; samplesPerSecond = 3914.6
-05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.17585291 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0644s; samplesPerSecond = 3884.2
-05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.14716791 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0628s; samplesPerSecond = 3979.1
-05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.16757751 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0643s; samplesPerSecond = 3885.5
-05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.10314917 * 250; EvalClassificationError = 0.04000000 * 250; time = 0.0642s; samplesPerSecond = 3895.3
-05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.20322217 * 250; EvalClassificationError = 0.11200000 * 250; time = 0.0650s; samplesPerSecond = 3848.0
-05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.16604797 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0642s; samplesPerSecond = 3892.3
-05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.15105725 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0651s; samplesPerSecond = 3839.4
-05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.19206934 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0640s; samplesPerSecond = 3903.9
-05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13667065 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.2688s; samplesPerSecond = 930.0
-05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.20713037 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0472s; samplesPerSecond = 5299.3
-05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.12862158 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0625s; samplesPerSecond = 3998.5
-05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17174683 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0465s; samplesPerSecond = 5381.7
-05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16493628 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0526s; samplesPerSecond = 4753.8
-05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.14843726 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0505s; samplesPerSecond = 4952.5
-05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.12574292 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0505s; samplesPerSecond = 4951.4
-05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13455151 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0614s; samplesPerSecond = 4072.8
-05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.16762988 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0495s; samplesPerSecond = 5055.0
-05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.22347461 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0523s; samplesPerSecond = 4780.1
-05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.18213623 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0542s; samplesPerSecond = 4611.6
-05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.19970923 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0539s; samplesPerSecond = 4638.8
-05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.22695947 * 250; EvalClassificationError = 0.12800000 * 250; time = 0.0542s; samplesPerSecond = 4609.7
-05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12664502 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0541s; samplesPerSecond = 4625.3
-05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.15838037 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0538s; samplesPerSecond = 4648.8
-05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.11555566 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0581s; samplesPerSecond = 4305.4
-05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.14157520 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0544s; samplesPerSecond = 4595.2
-05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.18558350 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0541s; samplesPerSecond = 4622.4
-05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.15083594 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0540s; samplesPerSecond = 4632.9
-05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.12831787 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0541s; samplesPerSecond = 4624.1
-05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.17656494 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0545s; samplesPerSecond = 4587.6
-05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.14956396 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0625s; samplesPerSecond = 4000.3
-05/03/2016 15:21:26:  Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.11451660 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0496s; samplesPerSecond = 5040.3
-05/03/2016 15:21:26:  Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16392383 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0496s; samplesPerSecond = 5036.0
-05/03/2016 15:21:26:  Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14811230 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0505s; samplesPerSecond = 4955.0
-05/03/2016 15:21:26:  Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.16003760 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0588s; samplesPerSecond = 4255.2
-05/03/2016 15:21:26:  Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.17969775 * 250; EvalClassificationError = 0.09600000 * 250; time = 0.0482s; samplesPerSecond = 5185.4
-05/03/2016 15:21:26: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.15964808 * 10000; EvalClassificationError = 0.07750000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=2.49695s
+05/03/2016 15:21:23:  Epoch[ 3 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.18214960 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0604s; samplesPerSecond = 4138.7
+05/03/2016 15:21:23:  Epoch[ 3 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.13526825 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0622s; samplesPerSecond = 4020.6
+05/03/2016 15:21:23:  Epoch[ 3 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.14344995 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0640s; samplesPerSecond = 3906.0
+05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.12557471 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0628s; samplesPerSecond = 3978.7
+05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.17627924 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0639s; samplesPerSecond = 3914.6
+05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.17585291 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0644s; samplesPerSecond = 3884.2
+05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.14716791 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0628s; samplesPerSecond = 3979.1
+05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.16757751 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0643s; samplesPerSecond = 3885.5
+05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.10314917 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0642s; samplesPerSecond = 3895.3
+05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.20322217 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0650s; samplesPerSecond = 3848.0
+05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.16604797 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0642s; samplesPerSecond = 3892.3
+05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.15105725 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0651s; samplesPerSecond = 3839.4
+05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.19206934 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0640s; samplesPerSecond = 3903.9
+05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13667065 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.2688s; samplesPerSecond = 930.0
+05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.20713037 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0472s; samplesPerSecond = 5299.3
+05/03/2016 15:21:24:  Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.12862158 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0625s; samplesPerSecond = 3998.5
+05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17174683 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0465s; samplesPerSecond = 5381.7
+05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16493628 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0526s; samplesPerSecond = 4753.8
+05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.14843726 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0505s; samplesPerSecond = 4952.5
+05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.12574292 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0505s; samplesPerSecond = 4951.4
+05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13455151 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0614s; samplesPerSecond = 4072.8
+05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.16762988 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0495s; samplesPerSecond = 5055.0
+05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.22347461 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0523s; samplesPerSecond = 4780.1
+05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.18213623 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0542s; samplesPerSecond = 4611.6
+05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.19970923 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0539s; samplesPerSecond = 4638.8
+05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.22695947 * 250; EvalErrorPrediction = 0.12800000 * 250; time = 0.0542s; samplesPerSecond = 4609.7
+05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12664502 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0541s; samplesPerSecond = 4625.3
+05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.15838037 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0538s; samplesPerSecond = 4648.8
+05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.11555566 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0581s; samplesPerSecond = 4305.4
+05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.14157520 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0544s; samplesPerSecond = 4595.2
+05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.18558350 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0541s; samplesPerSecond = 4622.4
+05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.15083594 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0540s; samplesPerSecond = 4632.9
+05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.12831787 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0541s; samplesPerSecond = 4624.1
+05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.17656494 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0545s; samplesPerSecond = 4587.6
+05/03/2016 15:21:25:  Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.14956396 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0625s; samplesPerSecond = 4000.3
+05/03/2016 15:21:26:  Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.11451660 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0496s; samplesPerSecond = 5040.3
+05/03/2016 15:21:26:  Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16392383 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0496s; samplesPerSecond = 5036.0
+05/03/2016 15:21:26:  Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14811230 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0505s; samplesPerSecond = 4955.0
+05/03/2016 15:21:26:  Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.16003760 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0588s; samplesPerSecond = 4255.2
+05/03/2016 15:21:26:  Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.17969775 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0482s; samplesPerSecond = 5185.4
+05/03/2016 15:21:26: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.15964808 * 10000; EvalErrorPrediction = 0.07750000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=2.49695s
 05/03/2016 15:21:26: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152115.267374/CNTKTextFormatReader/Examples/Other/Simple2d_Simple@release_cpu/Models/simple.dnn'
 05/03/2016 15:21:26: CNTKCommandTrainEnd: Simple_Demo_Train

@ -608,7 +608,7 @@ Post-processing network...

 7 roots:
 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
 	InvStdOfFeatures = InvStdDev()
 	MeanOfFeatures = Mean()
 	PosteriorProb = Softmax()
@ -637,7 +637,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *1] -> [2 x 1 x *1]
 Validating --> B2 = LearnableParameter() :  -> [2 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *1], [2 x 1] -> [2 x 1 x *1]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *1] -> [2 x 1 x *1]
 Validating --> Prior = Mean (labels) : [2 x *1] -> [2]
 Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@ -661,7 +661,7 @@ Allocating matrices for forward and/or backward propagation.

 Memory Sharing Structure:

-(nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalClassificationError Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
+(nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalErrorPrediction Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
 0x2e83eb8: {[W2 Value[2 x 50]] }
 0x2e87ac8: {[MVNormalizedFeatures Value[2 x *1]] }
 0x2e87e78: {[W0*features Value[50 x *1]] }
@ -676,7 +676,7 @@ Memory Sharing Structure:
 0x2e8d298: {[B2 Value[2 x 1]] }
 0x2e8f2c8: {[labels Value[2 x *1]] }
 0x2e8f8e8: {[MeanOfFeatures Value[2]] }
-0x2e91598: {[EvalClassificationError Value[1]] }
+0x2e91598: {[EvalErrorPrediction Value[1]] }
 0x2e916f8: {[CrossEntropyWithSoftmax Value[1]] }
 0x2e91bb8: {[LogOfPrior Value[2]] }
 0x2e93758: {[B0 Value[50 x 1]] }
@ -686,7 +686,7 @@ Memory Sharing Structure:
 0x2e985f8: {[W1 Value[50 x 50]] }
 0x2e99178: {[features Value[2 x *1]] }

-05/03/2016 15:21:26: Final Results: Minibatch[1-1]: EvalClassificationError = 0.05970149 * 603; CrossEntropyWithSoftmax = 0.13085309 * 603; perplexity = 1.13980032
+05/03/2016 15:21:26: Final Results: Minibatch[1-1]: EvalErrorPrediction = 0.05970149 * 603; CrossEntropyWithSoftmax = 0.13085309 * 603; perplexity = 1.13980032

 05/03/2016 15:21:26: Action "test" complete.

@ -702,7 +702,7 @@ Post-processing network...

 8 roots:
 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
 	InvStdOfFeatures = InvStdDev()
 	MeanOfFeatures = Mean()
 	PosteriorProb = Softmax()
@ -732,7 +732,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *2] -> [2 x 1 x *2]
 Validating --> B2 = LearnableParameter() :  -> [2 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *2], [2 x 1] -> [2 x 1 x *2]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *2] -> [2 x 1 x *2]
 Validating --> Prior = Mean (labels) : [2 x *2] -> [2]
 Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@ -755,7 +755,7 @@ Allocating matrices for forward and/or backward propagation.

 Memory Sharing Structure:

-(nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [CrossEntropyWithSoftmax Value[1]] [EvalClassificationError Gradient[1]] [EvalClassificationError Value[1]] [H1 Gradient[50 x 1 x *2]] [H2 Gradient[50 x 1 x *2]] [HLast Gradient[2 x 1 x *2]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *2]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *2]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *2]] [ScaledLogLikelihood Value[2 x 1 x *2]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *2]] [W0*features+B0 Gradient[50 x 1 x *2]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *2]] [W1*H1+B1 Gradient[50 x 1 x *2]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *2]] [features Gradient[2 x *2]] [labels Gradient[2 x *2]] }
+(nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [CrossEntropyWithSoftmax Value[1]] [EvalErrorPrediction Gradient[1]] [EvalErrorPrediction Value[1]] [H1 Gradient[50 x 1 x *2]] [H2 Gradient[50 x 1 x *2]] [HLast Gradient[2 x 1 x *2]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *2]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *2]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *2]] [ScaledLogLikelihood Value[2 x 1 x *2]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *2]] [W0*features+B0 Gradient[50 x 1 x *2]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *2]] [W1*H1+B1 Gradient[50 x 1 x *2]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *2]] [features Gradient[2 x *2]] [labels Gradient[2 x *2]] }
 0x2e82858: {[PosteriorProb Value[2 x 1 x *2]] }
 0x2e83b58: {[labels Value[2 x *2]] }
 0x2e84318: {[MeanOfFeatures Value[2]] }
--- a/Tests/EndToEndTests/Examples/Other/Simple2d/Simple/baseline.linux.gpu.txt
+++ b/Tests/EndToEndTests/Examples/Other/Simple2d/Simple/baseline.linux.gpu.txt
@ -58,7 +58,7 @@ Simple_Demo_Train = [
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -157,7 +157,7 @@ Simple_Demo_Train = [
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -300,7 +300,7 @@ configparameters: Simple.cntk:Simple_Demo_Train=[
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -356,7 +356,7 @@ Post-processing network...

 7 roots:
 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
 	InvStdOfFeatures = InvStdDev()
 	MeanOfFeatures = Mean()
 	PosteriorProb = Softmax()
@ -385,7 +385,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *] -> [2 x 1 x *]
 Validating --> B2 = LearnableParameter() :  -> [2 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *], [2 x 1] -> [2 x 1 x *]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *] -> [2 x 1 x *]
 Validating --> Prior = Mean (labels) : [2 x *] -> [2]
 Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@ -409,14 +409,14 @@ Post-processing network complete.

 05/03/2016 15:21:27: Evaluation criterion node(s):

-05/03/2016 15:21:27: 	EvalClassificationError = ClassificationError
+05/03/2016 15:21:27: 	EvalErrorPrediction = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.

 Memory Sharing Structure:

-(nil): {[EvalClassificationError Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
+(nil): {[EvalErrorPrediction Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
 0x1ef9338: {[features Value[2 x *]] }
 0x2b32ad8: {[MeanOfFeatures Value[2]] }
 0x2b32fe8: {[InvStdOfFeatures Value[2]] }
@ -429,7 +429,7 @@ Memory Sharing Structure:
 0x3185898: {[Prior Value[2]] }
 0x3186bd8: {[LogOfPrior Value[2]] }
 0x318b378: {[H1 Value[50 x 1 x *]] [W0*features Gradient[50 x *]] }
-0x318b498: {[EvalClassificationError Value[1]] }
+0x318b498: {[EvalErrorPrediction Value[1]] }
 0x318b798: {[ScaledLogLikelihood Value[2 x 1 x *]] }
 0x318b8f8: {[CrossEntropyWithSoftmax Value[1]] }
 0x3191148: {[B0 Value[50 x 1]] }
@ -459,139 +459,139 @@ Memory Sharing Structure:
 05/03/2016 15:21:28: Starting Epoch 1: learning rate per sample = 0.020000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 15:21:28: Starting minibatch loop.
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[   1-  10]: CrossEntropyWithSoftmax = 0.70004456 * 250; EvalClassificationError = 0.52000000 * 250; time = 0.0055s; samplesPerSecond = 45495.9
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[  11-  20]: CrossEntropyWithSoftmax = 0.70309900 * 250; EvalClassificationError = 0.51600000 * 250; time = 0.0046s; samplesPerSecond = 54347.8
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[  21-  30]: CrossEntropyWithSoftmax = 0.70606104 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0046s; samplesPerSecond = 54241.7
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[  31-  40]: CrossEntropyWithSoftmax = 0.69845532 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0046s; samplesPerSecond = 54549.4
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[  41-  50]: CrossEntropyWithSoftmax = 0.73496533 * 250; EvalClassificationError = 0.57600000 * 250; time = 0.0046s; samplesPerSecond = 54136.0
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[  51-  60]: CrossEntropyWithSoftmax = 0.72522827 * 250; EvalClassificationError = 0.50800000 * 250; time = 0.0046s; samplesPerSecond = 54359.6
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[  61-  70]: CrossEntropyWithSoftmax = 0.73287500 * 250; EvalClassificationError = 0.48800000 * 250; time = 0.0046s; samplesPerSecond = 54466.2
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[  71-  80]: CrossEntropyWithSoftmax = 0.70135547 * 250; EvalClassificationError = 0.51600000 * 250; time = 0.0046s; samplesPerSecond = 54872.7
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[  81-  90]: CrossEntropyWithSoftmax = 0.72466504 * 250; EvalClassificationError = 0.46000000 * 250; time = 0.0046s; samplesPerSecond = 54194.7
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[  91- 100]: CrossEntropyWithSoftmax = 0.72187500 * 250; EvalClassificationError = 0.52000000 * 250; time = 0.0046s; samplesPerSecond = 54501.9
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.69799023 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 54788.5
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.70696387 * 250; EvalClassificationError = 0.54800000 * 250; time = 0.0046s; samplesPerSecond = 54371.5
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.69863965 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0046s; samplesPerSecond = 54300.6
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.71772461 * 250; EvalClassificationError = 0.54800000 * 250; time = 0.0046s; samplesPerSecond = 54644.8
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.69526270 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0046s; samplesPerSecond = 54525.6
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.71436426 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0046s; samplesPerSecond = 54561.3
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70399316 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0046s; samplesPerSecond = 54573.2
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.71745508 * 250; EvalClassificationError = 0.48800000 * 250; time = 0.0046s; samplesPerSecond = 54716.6
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.71963184 * 250; EvalClassificationError = 0.49600000 * 250; time = 0.0046s; samplesPerSecond = 54537.5
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.70689941 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0046s; samplesPerSecond = 54336.0
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.70425098 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 54692.6
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70622754 * 250; EvalClassificationError = 0.45200000 * 250; time = 0.0046s; samplesPerSecond = 54561.3
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69729492 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 54537.5
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.75974219 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0046s; samplesPerSecond = 54680.7
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.70631250 * 250; EvalClassificationError = 0.43600000 * 250; time = 0.0046s; samplesPerSecond = 54288.8
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.70705664 * 250; EvalClassificationError = 0.46400000 * 250; time = 0.0046s; samplesPerSecond = 54561.3
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.72660352 * 250; EvalClassificationError = 0.51600000 * 250; time = 0.0046s; samplesPerSecond = 54824.6
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.71369727 * 250; EvalClassificationError = 0.55600000 * 250; time = 0.0046s; samplesPerSecond = 54537.5
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.68916602 * 250; EvalClassificationError = 0.47200000 * 250; time = 0.0046s; samplesPerSecond = 54371.5
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.69964844 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0046s; samplesPerSecond = 54218.2
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.69387891 * 250; EvalClassificationError = 0.46000000 * 250; time = 0.0045s; samplesPerSecond = 54969.2
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.68885742 * 250; EvalClassificationError = 0.50400000 * 250; time = 0.0046s; samplesPerSecond = 54573.2
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.69388867 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 54454.4
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.70363867 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0046s; samplesPerSecond = 54824.6
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.65449219 * 250; EvalClassificationError = 0.44400000 * 250; time = 0.0046s; samplesPerSecond = 54561.3
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.64607031 * 250; EvalClassificationError = 0.46000000 * 250; time = 0.0046s; samplesPerSecond = 54347.8
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.59492969 * 250; EvalClassificationError = 0.12400000 * 250; time = 0.0046s; samplesPerSecond = 54764.5
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.53965820 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54609.0
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.43681445 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0046s; samplesPerSecond = 54525.6
-05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.37407422 * 250; EvalClassificationError = 0.12000000 * 250; time = 0.0046s; samplesPerSecond = 54466.2
-05/03/2016 15:21:28: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.68409629 * 10000; EvalClassificationError = 0.45780000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=0.1879s
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[   1-  10]: CrossEntropyWithSoftmax = 0.70004456 * 250; EvalErrorPrediction = 0.52000000 * 250; time = 0.0055s; samplesPerSecond = 45495.9
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[  11-  20]: CrossEntropyWithSoftmax = 0.70309900 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0046s; samplesPerSecond = 54347.8
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[  21-  30]: CrossEntropyWithSoftmax = 0.70606104 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0046s; samplesPerSecond = 54241.7
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[  31-  40]: CrossEntropyWithSoftmax = 0.69845532 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0046s; samplesPerSecond = 54549.4
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[  41-  50]: CrossEntropyWithSoftmax = 0.73496533 * 250; EvalErrorPrediction = 0.57600000 * 250; time = 0.0046s; samplesPerSecond = 54136.0
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[  51-  60]: CrossEntropyWithSoftmax = 0.72522827 * 250; EvalErrorPrediction = 0.50800000 * 250; time = 0.0046s; samplesPerSecond = 54359.6
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[  61-  70]: CrossEntropyWithSoftmax = 0.73287500 * 250; EvalErrorPrediction = 0.48800000 * 250; time = 0.0046s; samplesPerSecond = 54466.2
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[  71-  80]: CrossEntropyWithSoftmax = 0.70135547 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0046s; samplesPerSecond = 54872.7
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[  81-  90]: CrossEntropyWithSoftmax = 0.72466504 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0046s; samplesPerSecond = 54194.7
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[  91- 100]: CrossEntropyWithSoftmax = 0.72187500 * 250; EvalErrorPrediction = 0.52000000 * 250; time = 0.0046s; samplesPerSecond = 54501.9
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.69799023 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 54788.5
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.70696387 * 250; EvalErrorPrediction = 0.54800000 * 250; time = 0.0046s; samplesPerSecond = 54371.5
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.69863965 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0046s; samplesPerSecond = 54300.6
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.71772461 * 250; EvalErrorPrediction = 0.54800000 * 250; time = 0.0046s; samplesPerSecond = 54644.8
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.69526270 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0046s; samplesPerSecond = 54525.6
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.71436426 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0046s; samplesPerSecond = 54561.3
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70399316 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0046s; samplesPerSecond = 54573.2
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.71745508 * 250; EvalErrorPrediction = 0.48800000 * 250; time = 0.0046s; samplesPerSecond = 54716.6
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.71963184 * 250; EvalErrorPrediction = 0.49600000 * 250; time = 0.0046s; samplesPerSecond = 54537.5
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.70689941 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0046s; samplesPerSecond = 54336.0
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.70425098 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 54692.6
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70622754 * 250; EvalErrorPrediction = 0.45200000 * 250; time = 0.0046s; samplesPerSecond = 54561.3
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69729492 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 54537.5
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.75974219 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0046s; samplesPerSecond = 54680.7
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.70631250 * 250; EvalErrorPrediction = 0.43600000 * 250; time = 0.0046s; samplesPerSecond = 54288.8
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.70705664 * 250; EvalErrorPrediction = 0.46400000 * 250; time = 0.0046s; samplesPerSecond = 54561.3
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.72660352 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0046s; samplesPerSecond = 54824.6
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.71369727 * 250; EvalErrorPrediction = 0.55600000 * 250; time = 0.0046s; samplesPerSecond = 54537.5
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.68916602 * 250; EvalErrorPrediction = 0.47200000 * 250; time = 0.0046s; samplesPerSecond = 54371.5
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.69964844 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0046s; samplesPerSecond = 54218.2
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.69387891 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0045s; samplesPerSecond = 54969.2
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.68885742 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.0046s; samplesPerSecond = 54573.2
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.69388867 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 54454.4
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.70363867 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0046s; samplesPerSecond = 54824.6
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.65449219 * 250; EvalErrorPrediction = 0.44400000 * 250; time = 0.0046s; samplesPerSecond = 54561.3
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.64607031 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0046s; samplesPerSecond = 54347.8
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.59492969 * 250; EvalErrorPrediction = 0.12400000 * 250; time = 0.0046s; samplesPerSecond = 54764.5
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.53965820 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54609.0
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.43681445 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0046s; samplesPerSecond = 54525.6
+05/03/2016 15:21:28:  Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.37407422 * 250; EvalErrorPrediction = 0.12000000 * 250; time = 0.0046s; samplesPerSecond = 54466.2
+05/03/2016 15:21:28: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.68409629 * 10000; EvalErrorPrediction = 0.45780000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=0.1879s
 05/03/2016 15:21:28: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152115.267374/CNTKTextFormatReader/Examples/Other/Simple2d_Simple@release_gpu/Models/simple.dnn.1'

 05/03/2016 15:21:28: Starting Epoch 2: learning rate per sample = 0.008000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 15:21:28: Starting minibatch loop.
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.27895840 * 250; EvalClassificationError = 0.09600000 * 250; time = 0.0046s; samplesPerSecond = 53902.5
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.24395615 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54933.0
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.19587115 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0046s; samplesPerSecond = 54824.6
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.16368213 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0045s; samplesPerSecond = 55126.8
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.19700140 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0046s; samplesPerSecond = 54933.0
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.19580530 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54585.2
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.18257983 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0045s; samplesPerSecond = 55248.6
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.17520911 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0046s; samplesPerSecond = 54752.5
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.20164514 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0046s; samplesPerSecond = 54752.5
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.19787024 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0046s; samplesPerSecond = 54466.2
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.13437573 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0045s; samplesPerSecond = 55090.3
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.19004956 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0046s; samplesPerSecond = 54848.6
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.12287280 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0045s; samplesPerSecond = 54957.1
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16975903 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0045s; samplesPerSecond = 55175.5
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.16102686 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54513.7
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.18611646 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54800.5
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.18469507 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0045s; samplesPerSecond = 55334.2
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18472339 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54908.9
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.20064648 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0046s; samplesPerSecond = 54597.1
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.13324683 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0045s; samplesPerSecond = 54969.2
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13878418 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0045s; samplesPerSecond = 55078.2
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.15587354 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0046s; samplesPerSecond = 54920.9
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.15337378 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54812.5
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.14797070 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0045s; samplesPerSecond = 55199.8
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.12512891 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0046s; samplesPerSecond = 54383.3
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.14058545 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0045s; samplesPerSecond = 54993.4
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12611963 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0045s; samplesPerSecond = 54945.1
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.18970605 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0046s; samplesPerSecond = 54884.7
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.17965479 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0045s; samplesPerSecond = 54969.2
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.18866455 * 250; EvalClassificationError = 0.09600000 * 250; time = 0.0046s; samplesPerSecond = 54836.6
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17539941 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0045s; samplesPerSecond = 54945.1
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.14742432 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54848.6
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.13789502 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0046s; samplesPerSecond = 54788.5
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.13652100 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0045s; samplesPerSecond = 55224.2
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13619336 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0046s; samplesPerSecond = 54920.9
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.14909424 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54478.1
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.14762256 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0045s; samplesPerSecond = 55139.0
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.13142578 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0046s; samplesPerSecond = 54860.7
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.19570459 * 250; EvalClassificationError = 0.11600000 * 250; time = 0.0046s; samplesPerSecond = 54764.5
-05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15718604 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0045s; samplesPerSecond = 55005.5
-05/03/2016 15:21:28: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.16901047 * 10000; EvalClassificationError = 0.07510000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=0.184798s
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.27895840 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0046s; samplesPerSecond = 53902.5
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.24395615 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54933.0
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.19587115 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0046s; samplesPerSecond = 54824.6
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.16368213 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0045s; samplesPerSecond = 55126.8
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.19700140 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0046s; samplesPerSecond = 54933.0
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.19580530 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54585.2
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.18257983 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0045s; samplesPerSecond = 55248.6
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.17520911 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0046s; samplesPerSecond = 54752.5
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.20164514 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0046s; samplesPerSecond = 54752.5
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.19787024 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0046s; samplesPerSecond = 54466.2
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.13437573 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0045s; samplesPerSecond = 55090.3
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.19004956 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0046s; samplesPerSecond = 54848.6
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.12287280 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0045s; samplesPerSecond = 54957.1
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16975903 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0045s; samplesPerSecond = 55175.5
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.16102686 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54513.7
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.18611646 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54800.5
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.18469507 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0045s; samplesPerSecond = 55334.2
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18472339 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54908.9
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.20064648 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0046s; samplesPerSecond = 54597.1
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.13324683 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0045s; samplesPerSecond = 54969.2
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13878418 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0045s; samplesPerSecond = 55078.2
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.15587354 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0046s; samplesPerSecond = 54920.9
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.15337378 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54812.5
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.14797070 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0045s; samplesPerSecond = 55199.8
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.12512891 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0046s; samplesPerSecond = 54383.3
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.14058545 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0045s; samplesPerSecond = 54993.4
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12611963 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0045s; samplesPerSecond = 54945.1
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.18970605 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0046s; samplesPerSecond = 54884.7
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.17965479 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0045s; samplesPerSecond = 54969.2
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.18866455 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0046s; samplesPerSecond = 54836.6
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17539941 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0045s; samplesPerSecond = 54945.1
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.14742432 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54848.6
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.13789502 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0046s; samplesPerSecond = 54788.5
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.13652100 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0045s; samplesPerSecond = 55224.2
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13619336 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0046s; samplesPerSecond = 54920.9
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.14909424 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54478.1
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.14762256 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0045s; samplesPerSecond = 55139.0
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.13142578 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0046s; samplesPerSecond = 54860.7
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.19570459 * 250; EvalErrorPrediction = 0.11600000 * 250; time = 0.0046s; samplesPerSecond = 54764.5
+05/03/2016 15:21:28:  Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15718604 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0045s; samplesPerSecond = 55005.5
+05/03/2016 15:21:28: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.16901047 * 10000; EvalErrorPrediction = 0.07510000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=0.184798s
 05/03/2016 15:21:28: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152115.267374/CNTKTextFormatReader/Examples/Other/Simple2d_Simple@release_gpu/Models/simple.dnn.2'

 05/03/2016 15:21:28: Starting Epoch 3: learning rate per sample = 0.008000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 15:21:28: Starting minibatch loop.
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.18133401 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54124.3
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.13605756 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0046s; samplesPerSecond = 54884.7
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.14345651 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54668.7
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.12512610 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0045s; samplesPerSecond = 54969.2
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.17690991 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54800.5
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.17504150 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0046s; samplesPerSecond = 54740.5
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.14723834 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0045s; samplesPerSecond = 55224.2
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.16752893 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0045s; samplesPerSecond = 54993.4
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.10317773 * 250; EvalClassificationError = 0.04000000 * 250; time = 0.0046s; samplesPerSecond = 54800.5
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.20306372 * 250; EvalClassificationError = 0.11200000 * 250; time = 0.0045s; samplesPerSecond = 55248.6
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.16637036 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0045s; samplesPerSecond = 55066.1
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.15126868 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54824.6
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.19167224 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0046s; samplesPerSecond = 54884.7
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13687085 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0045s; samplesPerSecond = 55420.1
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.20709912 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0046s; samplesPerSecond = 54740.5
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.12918774 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0045s; samplesPerSecond = 54981.3
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17185107 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0045s; samplesPerSecond = 55322.0
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16523242 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54908.9
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.14880249 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0046s; samplesPerSecond = 54728.5
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.12590967 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0045s; samplesPerSecond = 54957.1
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13443018 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54872.7
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.16726147 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0046s; samplesPerSecond = 54836.6
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.22407422 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0045s; samplesPerSecond = 55041.8
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.18191553 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0045s; samplesPerSecond = 55078.2
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.19983057 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0046s; samplesPerSecond = 54680.7
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.22728223 * 250; EvalClassificationError = 0.12400000 * 250; time = 0.0046s; samplesPerSecond = 54692.6
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12720459 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0045s; samplesPerSecond = 55151.1
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.15842871 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0045s; samplesPerSecond = 54945.1
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.11558691 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0045s; samplesPerSecond = 54945.1
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.14163428 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0045s; samplesPerSecond = 55248.6
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.18560596 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0045s; samplesPerSecond = 54993.4
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.15099561 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0045s; samplesPerSecond = 55078.2
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.12822461 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54395.1
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.17662500 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0045s; samplesPerSecond = 55309.7
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.14950781 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0046s; samplesPerSecond = 54945.1
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.11450977 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0046s; samplesPerSecond = 54908.9
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16386768 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0045s; samplesPerSecond = 55260.8
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14811523 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0045s; samplesPerSecond = 54981.3
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.16021143 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54764.5
-05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.17989551 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0045s; samplesPerSecond = 55151.1
-05/03/2016 15:21:28: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.15971016 * 10000; EvalClassificationError = 0.07740000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=0.184406s
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.18133401 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54124.3
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.13605756 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0046s; samplesPerSecond = 54884.7
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.14345651 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54668.7
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.12512610 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0045s; samplesPerSecond = 54969.2
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.17690991 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54800.5
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.17504150 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0046s; samplesPerSecond = 54740.5
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.14723834 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0045s; samplesPerSecond = 55224.2
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.16752893 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0045s; samplesPerSecond = 54993.4
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.10317773 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0046s; samplesPerSecond = 54800.5
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.20306372 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0045s; samplesPerSecond = 55248.6
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.16637036 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0045s; samplesPerSecond = 55066.1
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.15126868 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54824.6
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.19167224 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0046s; samplesPerSecond = 54884.7
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13687085 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0045s; samplesPerSecond = 55420.1
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.20709912 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0046s; samplesPerSecond = 54740.5
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.12918774 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0045s; samplesPerSecond = 54981.3
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17185107 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0045s; samplesPerSecond = 55322.0
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16523242 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54908.9
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.14880249 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0046s; samplesPerSecond = 54728.5
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.12590967 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0045s; samplesPerSecond = 54957.1
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13443018 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54872.7
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.16726147 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0046s; samplesPerSecond = 54836.6
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.22407422 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0045s; samplesPerSecond = 55041.8
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.18191553 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0045s; samplesPerSecond = 55078.2
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.19983057 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0046s; samplesPerSecond = 54680.7
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.22728223 * 250; EvalErrorPrediction = 0.12400000 * 250; time = 0.0046s; samplesPerSecond = 54692.6
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12720459 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0045s; samplesPerSecond = 55151.1
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.15842871 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0045s; samplesPerSecond = 54945.1
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.11558691 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0045s; samplesPerSecond = 54945.1
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.14163428 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0045s; samplesPerSecond = 55248.6
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.18560596 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0045s; samplesPerSecond = 54993.4
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.15099561 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0045s; samplesPerSecond = 55078.2
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.12822461 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54395.1
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.17662500 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0045s; samplesPerSecond = 55309.7
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.14950781 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0046s; samplesPerSecond = 54945.1
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.11450977 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0046s; samplesPerSecond = 54908.9
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16386768 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0045s; samplesPerSecond = 55260.8
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14811523 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0045s; samplesPerSecond = 54981.3
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.16021143 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54764.5
+05/03/2016 15:21:28:  Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.17989551 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0045s; samplesPerSecond = 55151.1
+05/03/2016 15:21:28: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.15971016 * 10000; EvalErrorPrediction = 0.07740000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=0.184406s
 05/03/2016 15:21:28: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152115.267374/CNTKTextFormatReader/Examples/Other/Simple2d_Simple@release_gpu/Models/simple.dnn'
 05/03/2016 15:21:29: CNTKCommandTrainEnd: Simple_Demo_Train

@ -609,7 +609,7 @@ Post-processing network...

 7 roots:
 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
 	InvStdOfFeatures = InvStdDev()
 	MeanOfFeatures = Mean()
 	PosteriorProb = Softmax()
@ -638,7 +638,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *1] -> [2 x 1 x *1]
 Validating --> B2 = LearnableParameter() :  -> [2 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *1], [2 x 1] -> [2 x 1 x *1]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *1] -> [2 x 1 x *1]
 Validating --> Prior = Mean (labels) : [2 x *1] -> [2]
 Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@ -662,11 +662,11 @@ Allocating matrices for forward and/or backward propagation.

 Memory Sharing Structure:

-(nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalClassificationError Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
+(nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalErrorPrediction Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
 0x1efcc08: {[B2 Value[2 x 1]] }
 0x1efd8c8: {[W0 Value[50 x 2]] }
 0x1efee68: {[InvStdOfFeatures Value[2]] }
-0x2b337e8: {[EvalClassificationError Value[1]] }
+0x2b337e8: {[EvalErrorPrediction Value[1]] }
 0x2b33948: {[CrossEntropyWithSoftmax Value[1]] }
 0x2b33f08: {[LogOfPrior Value[2]] }
 0x31808e8: {[W2 Value[2 x 50]] }
@ -687,7 +687,7 @@ Memory Sharing Structure:
 0x7273058: {[W2*H1 Value[2 x 1 x *1]] }
 0x7273218: {[HLast Value[2 x 1 x *1]] }

-05/03/2016 15:21:29: Final Results: Minibatch[1-1]: EvalClassificationError = 0.05970149 * 603; CrossEntropyWithSoftmax = 0.13093129 * 603; perplexity = 1.13988946
+05/03/2016 15:21:29: Final Results: Minibatch[1-1]: EvalErrorPrediction = 0.05970149 * 603; CrossEntropyWithSoftmax = 0.13093129 * 603; perplexity = 1.13988946

 05/03/2016 15:21:29: Action "test" complete.

@ -703,7 +703,7 @@ Post-processing network...

 8 roots:
 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
 	InvStdOfFeatures = InvStdDev()
 	MeanOfFeatures = Mean()
 	PosteriorProb = Softmax()
@ -733,7 +733,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *2] -> [2 x 1 x *2]
 Validating --> B2 = LearnableParameter() :  -> [2 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *2], [2 x 1] -> [2 x 1 x *2]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *2] -> [2 x 1 x *2]
 Validating --> Prior = Mean (labels) : [2 x *2] -> [2]
 Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@ -756,7 +756,7 @@ Allocating matrices for forward and/or backward propagation.

 Memory Sharing Structure:

-(nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [CrossEntropyWithSoftmax Value[1]] [EvalClassificationError Gradient[1]] [EvalClassificationError Value[1]] [H1 Gradient[50 x 1 x *2]] [H2 Gradient[50 x 1 x *2]] [HLast Gradient[2 x 1 x *2]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *2]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *2]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *2]] [ScaledLogLikelihood Value[2 x 1 x *2]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *2]] [W0*features+B0 Gradient[50 x 1 x *2]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *2]] [W1*H1+B1 Gradient[50 x 1 x *2]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *2]] [features Gradient[2 x *2]] [labels Gradient[2 x *2]] }
+(nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [CrossEntropyWithSoftmax Value[1]] [EvalErrorPrediction Gradient[1]] [EvalErrorPrediction Value[1]] [H1 Gradient[50 x 1 x *2]] [H2 Gradient[50 x 1 x *2]] [HLast Gradient[2 x 1 x *2]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *2]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *2]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *2]] [ScaledLogLikelihood Value[2 x 1 x *2]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *2]] [W0*features+B0 Gradient[50 x 1 x *2]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *2]] [W1*H1+B1 Gradient[50 x 1 x *2]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *2]] [features Gradient[2 x *2]] [labels Gradient[2 x *2]] }
 0x1efcef8: {[features Value[2 x *2]] }
 0x1efe2c8: {[labels Value[2 x *2]] }
 0x1eff188: {[PosteriorProb Value[2 x 1 x *2]] }
--- a/Tests/EndToEndTests/Examples/Other/Simple2d/Simple/baseline.windows.cpu.txt
+++ b/Tests/EndToEndTests/Examples/Other/Simple2d/Simple/baseline.windows.cpu.txt
@ -56,7 +56,7 @@ Simple_Demo_Train = [
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -155,7 +155,7 @@ Simple_Demo_Train = [
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -298,7 +298,7 @@ configparameters: Simple.cntk:Simple_Demo_Train=[
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -353,7 +353,7 @@ Post-processing network...

 7 roots:
 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
 	InvStdOfFeatures = InvStdDev()
 	MeanOfFeatures = Mean()
 	PosteriorProb = Softmax()
@ -382,7 +382,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *] -> [2 x 1 x *]
 Validating --> B2 = LearnableParameter() :  -> [2 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *], [2 x 1] -> [2 x 1 x *]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *] -> [2 x 1 x *]
 Validating --> Prior = Mean (labels) : [2 x *] -> [2]
 Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@ -406,14 +406,14 @@ Post-processing network complete.

 05/03/2016 13:12:46: Evaluation criterion node(s):

-05/03/2016 13:12:46: 	EvalClassificationError = ClassificationError
+05/03/2016 13:12:46: 	EvalErrorPrediction = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.

 Memory Sharing Structure:

-0000000000000000: {[EvalClassificationError Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
+0000000000000000: {[EvalErrorPrediction Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
 000000702B410E90: {[features Value[2 x *]] }
 000000702B44E0C0: {[W0 Value[50 x 2]] }
 000000702B4D76F0: {[H2 Value[50 x 1 x *]] [W1*H1 Gradient[50 x 1 x *]] }
@ -428,7 +428,7 @@ Memory Sharing Structure:
 000000702B4D8690: {[B0 Gradient[50 x 1]] [H1 Gradient[50 x 1 x *]] [W1*H1+B1 Gradient[50 x 1 x *]] [W2*H1 Value[2 x 1 x *]] }
 000000702B4D8730: {[HLast Value[2 x 1 x *]] [W2 Gradient[2 x 50]] }
 000000702B4D89B0: {[CrossEntropyWithSoftmax Value[1]] }
-000000702B4D8AF0: {[EvalClassificationError Value[1]] }
+000000702B4D8AF0: {[EvalErrorPrediction Value[1]] }
 000000702B4D8B90: {[H1 Value[50 x 1 x *]] [W0*features Gradient[50 x *]] }
 000000702B4D8F50: {[B2 Gradient[2 x 1]] }
 000000702B4D91D0: {[ScaledLogLikelihood Value[2 x 1 x *]] }
@ -456,139 +456,139 @@ Memory Sharing Structure:
 05/03/2016 13:12:47: Starting Epoch 1: learning rate per sample = 0.020000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 13:12:47: Starting minibatch loop.
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[   1-  10]: CrossEntropyWithSoftmax = 0.70511987 * 250; EvalClassificationError = 0.55200000 * 250; time = 0.0327s; samplesPerSecond = 7657.0
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[  11-  20]: CrossEntropyWithSoftmax = 0.69754895 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0257s; samplesPerSecond = 9726.5
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[  21-  30]: CrossEntropyWithSoftmax = 0.71056921 * 250; EvalClassificationError = 0.50400000 * 250; time = 0.0248s; samplesPerSecond = 10096.1
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[  31-  40]: CrossEntropyWithSoftmax = 0.72951074 * 250; EvalClassificationError = 0.56000000 * 250; time = 0.0245s; samplesPerSecond = 10210.3
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[  41-  50]: CrossEntropyWithSoftmax = 0.70946655 * 250; EvalClassificationError = 0.48800000 * 250; time = 0.0249s; samplesPerSecond = 10032.5
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[  51-  60]: CrossEntropyWithSoftmax = 0.72656787 * 250; EvalClassificationError = 0.54400000 * 250; time = 0.0248s; samplesPerSecond = 10065.2
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[  61-  70]: CrossEntropyWithSoftmax = 0.69337402 * 250; EvalClassificationError = 0.43200000 * 250; time = 0.0256s; samplesPerSecond = 9766.8
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[  71-  80]: CrossEntropyWithSoftmax = 0.73605176 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0259s; samplesPerSecond = 9662.6
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[  81-  90]: CrossEntropyWithSoftmax = 0.71453076 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0239s; samplesPerSecond = 10469.0
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[  91- 100]: CrossEntropyWithSoftmax = 0.75191992 * 250; EvalClassificationError = 0.47200000 * 250; time = 0.0255s; samplesPerSecond = 9802.0
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.75975146 * 250; EvalClassificationError = 0.53200000 * 250; time = 0.0248s; samplesPerSecond = 10100.6
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.73172168 * 250; EvalClassificationError = 0.50800000 * 250; time = 0.0255s; samplesPerSecond = 9808.5
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.76840820 * 250; EvalClassificationError = 0.53200000 * 250; time = 0.0261s; samplesPerSecond = 9593.2
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.70464746 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0255s; samplesPerSecond = 9807.4
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.70557227 * 250; EvalClassificationError = 0.46400000 * 250; time = 0.0242s; samplesPerSecond = 10340.4
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.72711816 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0249s; samplesPerSecond = 10049.8
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70076660 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0247s; samplesPerSecond = 10117.4
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.69409766 * 250; EvalClassificationError = 0.49600000 * 250; time = 0.0254s; samplesPerSecond = 9834.0
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.69139941 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0243s; samplesPerSecond = 10275.8
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.73361621 * 250; EvalClassificationError = 0.55200000 * 250; time = 0.0255s; samplesPerSecond = 9802.8
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.72225879 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0246s; samplesPerSecond = 10146.5
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70356348 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0243s; samplesPerSecond = 10286.8
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69928613 * 250; EvalClassificationError = 0.46400000 * 250; time = 0.0252s; samplesPerSecond = 9909.2
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.72360938 * 250; EvalClassificationError = 0.51600000 * 250; time = 0.0244s; samplesPerSecond = 10227.0
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.69871875 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0244s; samplesPerSecond = 10243.8
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.69114844 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0248s; samplesPerSecond = 10081.5
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.68648047 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0254s; samplesPerSecond = 9844.5
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.69657227 * 250; EvalClassificationError = 0.46400000 * 250; time = 0.0258s; samplesPerSecond = 9679.8
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.71585547 * 250; EvalClassificationError = 0.45200000 * 250; time = 0.0255s; samplesPerSecond = 9798.2
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.69730664 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0260s; samplesPerSecond = 9609.1
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.70432422 * 250; EvalClassificationError = 0.53200000 * 250; time = 0.0265s; samplesPerSecond = 9431.1
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.69991797 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0257s; samplesPerSecond = 9722.7
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.68696875 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0259s; samplesPerSecond = 9647.3
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.67331445 * 250; EvalClassificationError = 0.37200000 * 250; time = 0.0267s; samplesPerSecond = 9364.7
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.65711328 * 250; EvalClassificationError = 0.43200000 * 250; time = 0.0258s; samplesPerSecond = 9700.1
-05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.64534375 * 250; EvalClassificationError = 0.44800000 * 250; time = 0.0260s; samplesPerSecond = 9608.0
-05/03/2016 13:12:48:  Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.61021875 * 250; EvalClassificationError = 0.36400000 * 250; time = 0.0263s; samplesPerSecond = 9515.5
-05/03/2016 13:12:48:  Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.54191016 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0229s; samplesPerSecond = 10907.5
-05/03/2016 13:12:48:  Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.45624414 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0239s; samplesPerSecond = 10479.5
-05/03/2016 13:12:48:  Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.37636133 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0229s; samplesPerSecond = 10917.0
-05/03/2016 13:12:48: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.68695688 * 10000; EvalClassificationError = 0.45550000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=1.01718s
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[   1-  10]: CrossEntropyWithSoftmax = 0.70511987 * 250; EvalErrorPrediction = 0.55200000 * 250; time = 0.0327s; samplesPerSecond = 7657.0
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[  11-  20]: CrossEntropyWithSoftmax = 0.69754895 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0257s; samplesPerSecond = 9726.5
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[  21-  30]: CrossEntropyWithSoftmax = 0.71056921 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.0248s; samplesPerSecond = 10096.1
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[  31-  40]: CrossEntropyWithSoftmax = 0.72951074 * 250; EvalErrorPrediction = 0.56000000 * 250; time = 0.0245s; samplesPerSecond = 10210.3
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[  41-  50]: CrossEntropyWithSoftmax = 0.70946655 * 250; EvalErrorPrediction = 0.48800000 * 250; time = 0.0249s; samplesPerSecond = 10032.5
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[  51-  60]: CrossEntropyWithSoftmax = 0.72656787 * 250; EvalErrorPrediction = 0.54400000 * 250; time = 0.0248s; samplesPerSecond = 10065.2
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[  61-  70]: CrossEntropyWithSoftmax = 0.69337402 * 250; EvalErrorPrediction = 0.43200000 * 250; time = 0.0256s; samplesPerSecond = 9766.8
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[  71-  80]: CrossEntropyWithSoftmax = 0.73605176 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0259s; samplesPerSecond = 9662.6
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[  81-  90]: CrossEntropyWithSoftmax = 0.71453076 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0239s; samplesPerSecond = 10469.0
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[  91- 100]: CrossEntropyWithSoftmax = 0.75191992 * 250; EvalErrorPrediction = 0.47200000 * 250; time = 0.0255s; samplesPerSecond = 9802.0
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.75975146 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0248s; samplesPerSecond = 10100.6
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.73172168 * 250; EvalErrorPrediction = 0.50800000 * 250; time = 0.0255s; samplesPerSecond = 9808.5
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.76840820 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0261s; samplesPerSecond = 9593.2
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.70464746 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0255s; samplesPerSecond = 9807.4
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.70557227 * 250; EvalErrorPrediction = 0.46400000 * 250; time = 0.0242s; samplesPerSecond = 10340.4
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.72711816 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0249s; samplesPerSecond = 10049.8
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70076660 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0247s; samplesPerSecond = 10117.4
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.69409766 * 250; EvalErrorPrediction = 0.49600000 * 250; time = 0.0254s; samplesPerSecond = 9834.0
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.69139941 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0243s; samplesPerSecond = 10275.8
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.73361621 * 250; EvalErrorPrediction = 0.55200000 * 250; time = 0.0255s; samplesPerSecond = 9802.8
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.72225879 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0246s; samplesPerSecond = 10146.5
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70356348 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0243s; samplesPerSecond = 10286.8
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69928613 * 250; EvalErrorPrediction = 0.46400000 * 250; time = 0.0252s; samplesPerSecond = 9909.2
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.72360938 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0244s; samplesPerSecond = 10227.0
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.69871875 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0244s; samplesPerSecond = 10243.8
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.69114844 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0248s; samplesPerSecond = 10081.5
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.68648047 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0254s; samplesPerSecond = 9844.5
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.69657227 * 250; EvalErrorPrediction = 0.46400000 * 250; time = 0.0258s; samplesPerSecond = 9679.8
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.71585547 * 250; EvalErrorPrediction = 0.45200000 * 250; time = 0.0255s; samplesPerSecond = 9798.2
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.69730664 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0260s; samplesPerSecond = 9609.1
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.70432422 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0265s; samplesPerSecond = 9431.1
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.69991797 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0257s; samplesPerSecond = 9722.7
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.68696875 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0259s; samplesPerSecond = 9647.3
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.67331445 * 250; EvalErrorPrediction = 0.37200000 * 250; time = 0.0267s; samplesPerSecond = 9364.7
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.65711328 * 250; EvalErrorPrediction = 0.43200000 * 250; time = 0.0258s; samplesPerSecond = 9700.1
+05/03/2016 13:12:47:  Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.64534375 * 250; EvalErrorPrediction = 0.44800000 * 250; time = 0.0260s; samplesPerSecond = 9608.0
+05/03/2016 13:12:48:  Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.61021875 * 250; EvalErrorPrediction = 0.36400000 * 250; time = 0.0263s; samplesPerSecond = 9515.5
+05/03/2016 13:12:48:  Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.54191016 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0229s; samplesPerSecond = 10907.5
+05/03/2016 13:12:48:  Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.45624414 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0239s; samplesPerSecond = 10479.5
+05/03/2016 13:12:48:  Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.37636133 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0229s; samplesPerSecond = 10917.0
+05/03/2016 13:12:48: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.68695688 * 10000; EvalErrorPrediction = 0.45550000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=1.01718s
 05/03/2016 13:12:48: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503141245.787579\CNTKTextFormatReader\Examples\Other\Simple2d_Simple@release_cpu/Models/simple.dnn.1'

 05/03/2016 13:12:48: Starting Epoch 2: learning rate per sample = 0.008000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 13:12:48: Starting minibatch loop.
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.28579105 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0228s; samplesPerSecond = 10943.3
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.27768619 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0230s; samplesPerSecond = 10860.1
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.23309790 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0223s; samplesPerSecond = 11187.2
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.20937585 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0221s; samplesPerSecond = 11327.1
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.20192059 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0225s; samplesPerSecond = 11116.5
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.21303992 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0232s; samplesPerSecond = 10762.9
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.17823340 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0247s; samplesPerSecond = 10120.6
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.18892688 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0231s; samplesPerSecond = 10816.4
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.14161328 * 250; EvalClassificationError = 0.04000000 * 250; time = 0.0225s; samplesPerSecond = 11100.8
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.15813574 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0226s; samplesPerSecond = 11077.1
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.21082446 * 250; EvalClassificationError = 0.11200000 * 250; time = 0.0233s; samplesPerSecond = 10728.2
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.16117041 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0229s; samplesPerSecond = 10928.0
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.15665234 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0223s; samplesPerSecond = 11195.2
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13067773 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0226s; samplesPerSecond = 11047.3
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.16602710 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0212s; samplesPerSecond = 11796.9
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.14975708 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0215s; samplesPerSecond = 11641.4
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.22351709 * 250; EvalClassificationError = 0.11200000 * 250; time = 0.0214s; samplesPerSecond = 11708.5
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18010474 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0207s; samplesPerSecond = 12085.5
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.15341577 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0207s; samplesPerSecond = 12072.6
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.17195337 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0209s; samplesPerSecond = 11976.6
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.15546069 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0217s; samplesPerSecond = 11534.6
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.16008325 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0214s; samplesPerSecond = 11689.3
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.15944043 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0209s; samplesPerSecond = 11981.2
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.15336865 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0207s; samplesPerSecond = 12102.4
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.14822266 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0212s; samplesPerSecond = 11766.4
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.14999512 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0211s; samplesPerSecond = 11833.2
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.15481982 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0208s; samplesPerSecond = 11992.7
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.17656738 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0204s; samplesPerSecond = 12229.1
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.22373242 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0213s; samplesPerSecond = 11738.7
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16403760 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0211s; samplesPerSecond = 11856.8
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17322168 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0211s; samplesPerSecond = 11868.0
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.13165430 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0205s; samplesPerSecond = 12202.3
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.14016992 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0208s; samplesPerSecond = 11993.9
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.18369678 * 250; EvalClassificationError = 0.09600000 * 250; time = 0.0214s; samplesPerSecond = 11657.7
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.15161035 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0215s; samplesPerSecond = 11612.8
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.18919824 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0215s; samplesPerSecond = 11632.8
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.17373975 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0212s; samplesPerSecond = 11818.1
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.15033740 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0208s; samplesPerSecond = 12036.6
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.12107568 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0207s; samplesPerSecond = 12075.5
-05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15386328 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0227s; samplesPerSecond = 10997.7
-05/03/2016 13:12:48: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.17515541 * 10000; EvalClassificationError = 0.07440000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=0.87149s
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.28579105 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0228s; samplesPerSecond = 10943.3
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.27768619 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0230s; samplesPerSecond = 10860.1
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.23309790 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0223s; samplesPerSecond = 11187.2
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.20937585 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0221s; samplesPerSecond = 11327.1
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.20192059 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0225s; samplesPerSecond = 11116.5
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.21303992 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0232s; samplesPerSecond = 10762.9
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.17823340 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0247s; samplesPerSecond = 10120.6
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.18892688 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0231s; samplesPerSecond = 10816.4
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.14161328 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0225s; samplesPerSecond = 11100.8
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.15813574 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0226s; samplesPerSecond = 11077.1
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.21082446 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0233s; samplesPerSecond = 10728.2
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.16117041 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0229s; samplesPerSecond = 10928.0
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.15665234 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0223s; samplesPerSecond = 11195.2
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13067773 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0226s; samplesPerSecond = 11047.3
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.16602710 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0212s; samplesPerSecond = 11796.9
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.14975708 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0215s; samplesPerSecond = 11641.4
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.22351709 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0214s; samplesPerSecond = 11708.5
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18010474 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0207s; samplesPerSecond = 12085.5
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.15341577 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0207s; samplesPerSecond = 12072.6
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.17195337 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0209s; samplesPerSecond = 11976.6
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.15546069 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0217s; samplesPerSecond = 11534.6
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.16008325 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0214s; samplesPerSecond = 11689.3
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.15944043 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0209s; samplesPerSecond = 11981.2
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.15336865 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0207s; samplesPerSecond = 12102.4
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.14822266 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0212s; samplesPerSecond = 11766.4
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.14999512 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0211s; samplesPerSecond = 11833.2
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.15481982 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0208s; samplesPerSecond = 11992.7
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.17656738 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0204s; samplesPerSecond = 12229.1
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.22373242 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0213s; samplesPerSecond = 11738.7
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16403760 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0211s; samplesPerSecond = 11856.8
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17322168 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0211s; samplesPerSecond = 11868.0
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.13165430 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0205s; samplesPerSecond = 12202.3
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.14016992 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0208s; samplesPerSecond = 11993.9
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.18369678 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0214s; samplesPerSecond = 11657.7
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.15161035 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0215s; samplesPerSecond = 11612.8
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.18919824 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0215s; samplesPerSecond = 11632.8
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.17373975 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0212s; samplesPerSecond = 11818.1
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.15033740 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0208s; samplesPerSecond = 12036.6
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.12107568 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0207s; samplesPerSecond = 12075.5
+05/03/2016 13:12:48:  Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15386328 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0227s; samplesPerSecond = 10997.7
+05/03/2016 13:12:48: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.17515541 * 10000; EvalErrorPrediction = 0.07440000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=0.87149s
 05/03/2016 13:12:48: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503141245.787579\CNTKTextFormatReader\Examples\Other\Simple2d_Simple@release_cpu/Models/simple.dnn.2'

 05/03/2016 13:12:48: Starting Epoch 3: learning rate per sample = 0.008000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 13:12:48: Starting minibatch loop.
-05/03/2016 13:12:48:  Epoch[ 3 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.10671188 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0217s; samplesPerSecond = 11511.2
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.17609265 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0205s; samplesPerSecond = 12183.8
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.14152701 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0208s; samplesPerSecond = 12001.9
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.16348053 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0213s; samplesPerSecond = 11748.1
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.11764551 * 250; EvalClassificationError = 0.04000000 * 250; time = 0.0219s; samplesPerSecond = 11435.4
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.16246954 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0212s; samplesPerSecond = 11811.4
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.16140149 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0207s; samplesPerSecond = 12078.5
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.19747632 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0202s; samplesPerSecond = 12391.0
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.20041309 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0214s; samplesPerSecond = 11659.9
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.13657080 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0208s; samplesPerSecond = 12033.7
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.20124377 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0203s; samplesPerSecond = 12293.5
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.17898120 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0206s; samplesPerSecond = 12144.2
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.16037830 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0232s; samplesPerSecond = 10779.1
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16276050 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0214s; samplesPerSecond = 11704.7
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.19882275 * 250; EvalClassificationError = 0.11200000 * 250; time = 0.0218s; samplesPerSecond = 11454.2
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.10263354 * 250; EvalClassificationError = 0.04000000 * 250; time = 0.0208s; samplesPerSecond = 12041.2
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17038770 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0213s; samplesPerSecond = 11725.5
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16624731 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0209s; samplesPerSecond = 11958.3
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.12664160 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0213s; samplesPerSecond = 11723.3
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.11944995 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0213s; samplesPerSecond = 11733.8
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.12949756 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0208s; samplesPerSecond = 11996.2
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.18147778 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0222s; samplesPerSecond = 11242.5
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.13172412 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0233s; samplesPerSecond = 10719.0
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.19600269 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0238s; samplesPerSecond = 10521.0
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.15840479 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0226s; samplesPerSecond = 11084.5
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.11888281 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0225s; samplesPerSecond = 11129.9
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.13710742 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0222s; samplesPerSecond = 11251.1
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.20026318 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0233s; samplesPerSecond = 10730.5
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.18824951 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0223s; samplesPerSecond = 11227.9
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16653223 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0225s; samplesPerSecond = 11096.3
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.11935254 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0229s; samplesPerSecond = 10918.5
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.16085400 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0225s; samplesPerSecond = 11132.9
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.16112646 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0219s; samplesPerSecond = 11439.6
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.12345313 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0229s; samplesPerSecond = 10904.6
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13502686 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0226s; samplesPerSecond = 11075.2
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.20874756 * 250; EvalClassificationError = 0.11200000 * 250; time = 0.0224s; samplesPerSecond = 11185.2
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16650537 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0227s; samplesPerSecond = 11009.3
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14995752 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0206s; samplesPerSecond = 12134.7
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.16497070 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0209s; samplesPerSecond = 11953.7
-05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.16843018 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0210s; samplesPerSecond = 11912.1
-05/03/2016 13:12:49: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.15794755 * 10000; EvalClassificationError = 0.07480000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=0.871499s
+05/03/2016 13:12:48:  Epoch[ 3 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.10671188 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0217s; samplesPerSecond = 11511.2
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.17609265 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0205s; samplesPerSecond = 12183.8
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.14152701 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0208s; samplesPerSecond = 12001.9
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.16348053 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0213s; samplesPerSecond = 11748.1
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.11764551 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0219s; samplesPerSecond = 11435.4
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.16246954 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0212s; samplesPerSecond = 11811.4
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.16140149 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0207s; samplesPerSecond = 12078.5
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.19747632 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0202s; samplesPerSecond = 12391.0
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.20041309 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0214s; samplesPerSecond = 11659.9
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.13657080 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0208s; samplesPerSecond = 12033.7
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.20124377 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0203s; samplesPerSecond = 12293.5
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.17898120 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0206s; samplesPerSecond = 12144.2
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.16037830 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0232s; samplesPerSecond = 10779.1
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16276050 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0214s; samplesPerSecond = 11704.7
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.19882275 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0218s; samplesPerSecond = 11454.2
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.10263354 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0208s; samplesPerSecond = 12041.2
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17038770 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0213s; samplesPerSecond = 11725.5
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16624731 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0209s; samplesPerSecond = 11958.3
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.12664160 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0213s; samplesPerSecond = 11723.3
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.11944995 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0213s; samplesPerSecond = 11733.8
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.12949756 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0208s; samplesPerSecond = 11996.2
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.18147778 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0222s; samplesPerSecond = 11242.5
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.13172412 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0233s; samplesPerSecond = 10719.0
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.19600269 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0238s; samplesPerSecond = 10521.0
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.15840479 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0226s; samplesPerSecond = 11084.5
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.11888281 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0225s; samplesPerSecond = 11129.9
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.13710742 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0222s; samplesPerSecond = 11251.1
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.20026318 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0233s; samplesPerSecond = 10730.5
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.18824951 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0223s; samplesPerSecond = 11227.9
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16653223 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0225s; samplesPerSecond = 11096.3
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.11935254 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0229s; samplesPerSecond = 10918.5
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.16085400 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0225s; samplesPerSecond = 11132.9
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.16112646 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0219s; samplesPerSecond = 11439.6
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.12345313 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0229s; samplesPerSecond = 10904.6
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13502686 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0226s; samplesPerSecond = 11075.2
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.20874756 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0224s; samplesPerSecond = 11185.2
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16650537 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0227s; samplesPerSecond = 11009.3
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14995752 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0206s; samplesPerSecond = 12134.7
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.16497070 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0209s; samplesPerSecond = 11953.7
+05/03/2016 13:12:49:  Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.16843018 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0210s; samplesPerSecond = 11912.1
+05/03/2016 13:12:49: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.15794755 * 10000; EvalErrorPrediction = 0.07480000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=0.871499s
 05/03/2016 13:12:49: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503141245.787579\CNTKTextFormatReader\Examples\Other\Simple2d_Simple@release_cpu/Models/simple.dnn'
 05/03/2016 13:12:49: CNTKCommandTrainEnd: Simple_Demo_Train

@ -606,7 +606,7 @@ Post-processing network...

 7 roots:
 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
 	InvStdOfFeatures = InvStdDev()
 	MeanOfFeatures = Mean()
 	PosteriorProb = Softmax()
@ -635,7 +635,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *1] -> [2 x 1 x *1]
 Validating --> B2 = LearnableParameter() :  -> [2 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *1], [2 x 1] -> [2 x 1 x *1]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *1] -> [2 x 1 x *1]
 Validating --> Prior = Mean (labels) : [2 x *1] -> [2]
 Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@ -659,7 +659,7 @@ Allocating matrices for forward and/or backward propagation.

 Memory Sharing Structure:

-0000000000000000: {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalClassificationError Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
+0000000000000000: {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalErrorPrediction Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
 00000070343C5200: {[InvStdOfFeatures Value[2]] }
 00000070343C5340: {[Prior Value[2]] }
 00000070343C53E0: {[W0 Value[50 x 2]] }
@ -671,7 +671,7 @@ Memory Sharing Structure:
 000000703442D030: {[HLast Value[2 x 1 x *1]] }
 000000703442D0D0: {[W0*features Value[50 x *1]] }
 000000703442D170: {[W1*H1+B1 Value[50 x 1 x *1]] }
-000000703442D2B0: {[EvalClassificationError Value[1]] }
+000000703442D2B0: {[EvalErrorPrediction Value[1]] }
 000000703442D530: {[CrossEntropyWithSoftmax Value[1]] }
 000000703442D5D0: {[W2 Value[2 x 50]] }
 000000703442D670: {[LogOfPrior Value[2]] }
@ -684,7 +684,7 @@ Memory Sharing Structure:
 0000007034432340: {[B0 Value[50 x 1]] }
 0000007034432480: {[B2 Value[2 x 1]] }

-05/03/2016 13:12:50: Final Results: Minibatch[1-1]: EvalClassificationError = 0.05638474 * 603; CrossEntropyWithSoftmax = 0.12474995 * 603; perplexity = 1.13286515
+05/03/2016 13:12:50: Final Results: Minibatch[1-1]: EvalErrorPrediction = 0.05638474 * 603; CrossEntropyWithSoftmax = 0.12474995 * 603; perplexity = 1.13286515

 05/03/2016 13:12:50: Action "test" complete.

@ -700,7 +700,7 @@ Post-processing network...

 8 roots:
 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
 	InvStdOfFeatures = InvStdDev()
 	MeanOfFeatures = Mean()
 	PosteriorProb = Softmax()
@ -730,7 +730,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *2] -> [2 x 1 x *2]
 Validating --> B2 = LearnableParameter() :  -> [2 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *2], [2 x 1] -> [2 x 1 x *2]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *2] -> [2 x 1 x *2]
 Validating --> Prior = Mean (labels) : [2 x *2] -> [2]
 Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@ -753,7 +753,7 @@ Allocating matrices for forward and/or backward propagation.

 Memory Sharing Structure:

-0000000000000000: {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [CrossEntropyWithSoftmax Value[1]] [EvalClassificationError Gradient[1]] [EvalClassificationError Value[1]] [H1 Gradient[50 x 1 x *2]] [H2 Gradient[50 x 1 x *2]] [HLast Gradient[2 x 1 x *2]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *2]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *2]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *2]] [ScaledLogLikelihood Value[2 x 1 x *2]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *2]] [W0*features+B0 Gradient[50 x 1 x *2]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *2]] [W1*H1+B1 Gradient[50 x 1 x *2]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *2]] [features Gradient[2 x *2]] [labels Gradient[2 x *2]] }
+0000000000000000: {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [CrossEntropyWithSoftmax Value[1]] [EvalErrorPrediction Gradient[1]] [EvalErrorPrediction Value[1]] [H1 Gradient[50 x 1 x *2]] [H2 Gradient[50 x 1 x *2]] [HLast Gradient[2 x 1 x *2]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *2]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *2]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *2]] [ScaledLogLikelihood Value[2 x 1 x *2]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *2]] [W0*features+B0 Gradient[50 x 1 x *2]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *2]] [W1*H1+B1 Gradient[50 x 1 x *2]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *2]] [features Gradient[2 x *2]] [labels Gradient[2 x *2]] }
 000000702E3275E0: {[H2 Value[50 x 1 x *2]] }
 000000702E327680: {[W2*H1 Value[2 x 1 x *2]] }
 000000702E3277C0: {[LogOfPrior Value[2]] }
--- a/Tests/EndToEndTests/Examples/Other/Simple2d/Simple/baseline.windows.gpu.txt
+++ b/Tests/EndToEndTests/Examples/Other/Simple2d/Simple/baseline.windows.gpu.txt
@ -56,7 +56,7 @@ Simple_Demo_Train = [
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -155,7 +155,7 @@ Simple_Demo_Train = [
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -298,7 +298,7 @@ configparameters: Simple.cntk:Simple_Demo_Train=[
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@ -354,7 +354,7 @@ Post-processing network...

 7 roots:
 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
 	InvStdOfFeatures = InvStdDev()
 	MeanOfFeatures = Mean()
 	PosteriorProb = Softmax()
@ -383,7 +383,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *] -> [2 x 1 x *]
 Validating --> B2 = LearnableParameter() :  -> [2 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *], [2 x 1] -> [2 x 1 x *]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *] -> [2 x 1 x *]
 Validating --> Prior = Mean (labels) : [2 x *] -> [2]
 Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@ -407,14 +407,14 @@ Post-processing network complete.

 05/03/2016 13:01:59: Evaluation criterion node(s):

-05/03/2016 13:01:59: 	EvalClassificationError = ClassificationError
+05/03/2016 13:01:59: 	EvalErrorPrediction = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.

 Memory Sharing Structure:

-0000000000000000: {[EvalClassificationError Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
+0000000000000000: {[EvalErrorPrediction Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
 000000501A590FF0: {[W2 Value[2 x 50]] }
 000000501A591090: {[W0 Value[50 x 2]] }
 000000501A5919F0: {[B1 Value[50 x 1]] }
@ -427,7 +427,7 @@ Memory Sharing Structure:
 000000501A5A1180: {[ScaledLogLikelihood Value[2 x 1 x *]] }
 000000501A5A1220: {[B0 Gradient[50 x 1]] [H1 Gradient[50 x 1 x *]] [W1*H1+B1 Gradient[50 x 1 x *]] [W2*H1 Value[2 x 1 x *]] }
 000000501A5A17C0: {[W0 Gradient[50 x 2]] [W0*features+B0 Value[50 x 1 x *]] }
-000000501A5A1900: {[EvalClassificationError Value[1]] }
+000000501A5A1900: {[EvalErrorPrediction Value[1]] }
 000000501A5A19A0: {[W0*features Value[50 x *]] }
 000000501A5A1A40: {[W2*H1 Gradient[2 x 1 x *]] }
 000000501A5A1F40: {[MVNormalizedFeatures Value[2 x *]] }
@ -457,139 +457,139 @@ Memory Sharing Structure:
 05/03/2016 13:01:59: Starting Epoch 1: learning rate per sample = 0.020000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 13:01:59: Starting minibatch loop.
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[   1-  10]: CrossEntropyWithSoftmax = 0.70650452 * 250; EvalClassificationError = 0.55200000 * 250; time = 0.0123s; samplesPerSecond = 20247.8
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[  11-  20]: CrossEntropyWithSoftmax = 0.69701831 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0112s; samplesPerSecond = 22393.4
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[  21-  30]: CrossEntropyWithSoftmax = 0.71089587 * 250; EvalClassificationError = 0.50400000 * 250; time = 0.0126s; samplesPerSecond = 19907.6
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[  31-  40]: CrossEntropyWithSoftmax = 0.72980273 * 250; EvalClassificationError = 0.56000000 * 250; time = 0.0113s; samplesPerSecond = 22042.0
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[  41-  50]: CrossEntropyWithSoftmax = 0.70902783 * 250; EvalClassificationError = 0.52800000 * 250; time = 0.0131s; samplesPerSecond = 19088.3
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[  51-  60]: CrossEntropyWithSoftmax = 0.72657300 * 250; EvalClassificationError = 0.54400000 * 250; time = 0.0138s; samplesPerSecond = 18059.7
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[  61-  70]: CrossEntropyWithSoftmax = 0.69319678 * 250; EvalClassificationError = 0.43200000 * 250; time = 0.0148s; samplesPerSecond = 16917.0
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[  71-  80]: CrossEntropyWithSoftmax = 0.73563477 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0164s; samplesPerSecond = 15236.5
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[  81-  90]: CrossEntropyWithSoftmax = 0.71463281 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0123s; samplesPerSecond = 20321.9
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[  91- 100]: CrossEntropyWithSoftmax = 0.75213428 * 250; EvalClassificationError = 0.47200000 * 250; time = 0.0167s; samplesPerSecond = 14944.1
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.75931445 * 250; EvalClassificationError = 0.53200000 * 250; time = 0.0131s; samplesPerSecond = 19105.8
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.73075293 * 250; EvalClassificationError = 0.50800000 * 250; time = 0.0132s; samplesPerSecond = 18886.5
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.76701953 * 250; EvalClassificationError = 0.53200000 * 250; time = 0.0128s; samplesPerSecond = 19574.1
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.70451270 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0128s; samplesPerSecond = 19467.4
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.70539941 * 250; EvalClassificationError = 0.50400000 * 250; time = 0.0143s; samplesPerSecond = 17444.7
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.72700293 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0123s; samplesPerSecond = 20391.5
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70096191 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0143s; samplesPerSecond = 17465.4
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.69437305 * 250; EvalClassificationError = 0.49600000 * 250; time = 0.0117s; samplesPerSecond = 21367.5
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.69161621 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0137s; samplesPerSecond = 18200.3
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.73388281 * 250; EvalClassificationError = 0.55200000 * 250; time = 0.0115s; samplesPerSecond = 21782.7
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.72255664 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0127s; samplesPerSecond = 19745.7
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70414551 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0131s; samplesPerSecond = 19017.2
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69976758 * 250; EvalClassificationError = 0.46000000 * 250; time = 0.0137s; samplesPerSecond = 18191.1
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.72419141 * 250; EvalClassificationError = 0.51600000 * 250; time = 0.0143s; samplesPerSecond = 17444.7
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.69943945 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0109s; samplesPerSecond = 22891.7
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.69206445 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0133s; samplesPerSecond = 18739.2
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.68771680 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0130s; samplesPerSecond = 19291.6
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.69878516 * 250; EvalClassificationError = 0.44000000 * 250; time = 0.0130s; samplesPerSecond = 19230.8
-05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.71889844 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0118s; samplesPerSecond = 21168.5
-05/03/2016 13:02:00:  Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.70086523 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0128s; samplesPerSecond = 19577.1
-05/03/2016 13:02:00:  Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.70878320 * 250; EvalClassificationError = 0.53200000 * 250; time = 0.0129s; samplesPerSecond = 19432.6
-05/03/2016 13:02:00:  Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.70674414 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0126s; samplesPerSecond = 19767.5
-05/03/2016 13:02:00:  Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.69707422 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0121s; samplesPerSecond = 20736.6
-05/03/2016 13:02:00:  Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.68588281 * 250; EvalClassificationError = 0.40800000 * 250; time = 0.0124s; samplesPerSecond = 20109.4
-05/03/2016 13:02:00:  Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.67734766 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0127s; samplesPerSecond = 19727.0
-05/03/2016 13:02:00:  Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.67958008 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0127s; samplesPerSecond = 19615.5
-05/03/2016 13:02:00:  Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.66424805 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0117s; samplesPerSecond = 21292.9
-05/03/2016 13:02:00:  Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.62412500 * 250; EvalClassificationError = 0.20400000 * 250; time = 0.0127s; samplesPerSecond = 19624.8
-05/03/2016 13:02:00:  Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.58007422 * 250; EvalClassificationError = 0.16000000 * 250; time = 0.0130s; samplesPerSecond = 19157.1
-05/03/2016 13:02:00:  Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.52764648 * 250; EvalClassificationError = 0.19200000 * 250; time = 0.0143s; samplesPerSecond = 17521.7
-05/03/2016 13:02:00: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.69975483 * 10000; EvalClassificationError = 0.46850000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=0.526194s
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[   1-  10]: CrossEntropyWithSoftmax = 0.70650452 * 250; EvalErrorPrediction = 0.55200000 * 250; time = 0.0123s; samplesPerSecond = 20247.8
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[  11-  20]: CrossEntropyWithSoftmax = 0.69701831 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0112s; samplesPerSecond = 22393.4
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[  21-  30]: CrossEntropyWithSoftmax = 0.71089587 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.0126s; samplesPerSecond = 19907.6
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[  31-  40]: CrossEntropyWithSoftmax = 0.72980273 * 250; EvalErrorPrediction = 0.56000000 * 250; time = 0.0113s; samplesPerSecond = 22042.0
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[  41-  50]: CrossEntropyWithSoftmax = 0.70902783 * 250; EvalErrorPrediction = 0.52800000 * 250; time = 0.0131s; samplesPerSecond = 19088.3
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[  51-  60]: CrossEntropyWithSoftmax = 0.72657300 * 250; EvalErrorPrediction = 0.54400000 * 250; time = 0.0138s; samplesPerSecond = 18059.7
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[  61-  70]: CrossEntropyWithSoftmax = 0.69319678 * 250; EvalErrorPrediction = 0.43200000 * 250; time = 0.0148s; samplesPerSecond = 16917.0
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[  71-  80]: CrossEntropyWithSoftmax = 0.73563477 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0164s; samplesPerSecond = 15236.5
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[  81-  90]: CrossEntropyWithSoftmax = 0.71463281 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0123s; samplesPerSecond = 20321.9
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[  91- 100]: CrossEntropyWithSoftmax = 0.75213428 * 250; EvalErrorPrediction = 0.47200000 * 250; time = 0.0167s; samplesPerSecond = 14944.1
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.75931445 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0131s; samplesPerSecond = 19105.8
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.73075293 * 250; EvalErrorPrediction = 0.50800000 * 250; time = 0.0132s; samplesPerSecond = 18886.5
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.76701953 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0128s; samplesPerSecond = 19574.1
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.70451270 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0128s; samplesPerSecond = 19467.4
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.70539941 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.0143s; samplesPerSecond = 17444.7
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.72700293 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0123s; samplesPerSecond = 20391.5
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70096191 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0143s; samplesPerSecond = 17465.4
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.69437305 * 250; EvalErrorPrediction = 0.49600000 * 250; time = 0.0117s; samplesPerSecond = 21367.5
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.69161621 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0137s; samplesPerSecond = 18200.3
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.73388281 * 250; EvalErrorPrediction = 0.55200000 * 250; time = 0.0115s; samplesPerSecond = 21782.7
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.72255664 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0127s; samplesPerSecond = 19745.7
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70414551 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0131s; samplesPerSecond = 19017.2
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69976758 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0137s; samplesPerSecond = 18191.1
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.72419141 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0143s; samplesPerSecond = 17444.7
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.69943945 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0109s; samplesPerSecond = 22891.7
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.69206445 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0133s; samplesPerSecond = 18739.2
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.68771680 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0130s; samplesPerSecond = 19291.6
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.69878516 * 250; EvalErrorPrediction = 0.44000000 * 250; time = 0.0130s; samplesPerSecond = 19230.8
+05/03/2016 13:01:59:  Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.71889844 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0118s; samplesPerSecond = 21168.5
+05/03/2016 13:02:00:  Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.70086523 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0128s; samplesPerSecond = 19577.1
+05/03/2016 13:02:00:  Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.70878320 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0129s; samplesPerSecond = 19432.6
+05/03/2016 13:02:00:  Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.70674414 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0126s; samplesPerSecond = 19767.5
+05/03/2016 13:02:00:  Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.69707422 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0121s; samplesPerSecond = 20736.6
+05/03/2016 13:02:00:  Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.68588281 * 250; EvalErrorPrediction = 0.40800000 * 250; time = 0.0124s; samplesPerSecond = 20109.4
+05/03/2016 13:02:00:  Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.67734766 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0127s; samplesPerSecond = 19727.0
+05/03/2016 13:02:00:  Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.67958008 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0127s; samplesPerSecond = 19615.5
+05/03/2016 13:02:00:  Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.66424805 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0117s; samplesPerSecond = 21292.9
+05/03/2016 13:02:00:  Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.62412500 * 250; EvalErrorPrediction = 0.20400000 * 250; time = 0.0127s; samplesPerSecond = 19624.8
+05/03/2016 13:02:00:  Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.58007422 * 250; EvalErrorPrediction = 0.16000000 * 250; time = 0.0130s; samplesPerSecond = 19157.1
+05/03/2016 13:02:00:  Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.52764648 * 250; EvalErrorPrediction = 0.19200000 * 250; time = 0.0143s; samplesPerSecond = 17521.7
+05/03/2016 13:02:00: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.69975483 * 10000; EvalErrorPrediction = 0.46850000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=0.526194s
 05/03/2016 13:02:00: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503140157.802427\CNTKTextFormatReader\Examples\Other\Simple2d_Simple@release_gpu/Models/simple.dnn.1'

 05/03/2016 13:02:00: Starting Epoch 2: learning rate per sample = 0.008000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 13:02:00: Starting minibatch loop.
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.44832977 * 250; EvalClassificationError = 0.15200000 * 250; time = 0.0124s; samplesPerSecond = 20205.3
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.40085291 * 250; EvalClassificationError = 0.12400000 * 250; time = 0.0142s; samplesPerSecond = 17631.7
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.33487201 * 250; EvalClassificationError = 0.09600000 * 250; time = 0.0129s; samplesPerSecond = 19405.4
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.29081885 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0125s; samplesPerSecond = 20016.0
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.26279236 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0118s; samplesPerSecond = 21188.2
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.25220630 * 250; EvalClassificationError = 0.09600000 * 250; time = 0.0138s; samplesPerSecond = 18158.0
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.20988293 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0129s; samplesPerSecond = 19447.7
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.21577441 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0148s; samplesPerSecond = 16846.4
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.16622900 * 250; EvalClassificationError = 0.04000000 * 250; time = 0.0157s; samplesPerSecond = 15967.3
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.17637866 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0144s; samplesPerSecond = 17315.4
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.22185278 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0123s; samplesPerSecond = 20366.6
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.17055811 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0151s; samplesPerSecond = 16564.0
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.16481055 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0140s; samplesPerSecond = 17910.9
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13871704 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0156s; samplesPerSecond = 16005.1
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.16922363 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0143s; samplesPerSecond = 17454.4
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.15403345 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0135s; samplesPerSecond = 18485.7
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.22255859 * 250; EvalClassificationError = 0.11200000 * 250; time = 0.0108s; samplesPerSecond = 23079.8
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18146851 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0133s; samplesPerSecond = 18843.7
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.15611523 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0131s; samplesPerSecond = 19081.1
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.17320215 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0137s; samplesPerSecond = 18192.4
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.15727930 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0117s; samplesPerSecond = 21404.1
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.16195410 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0119s; samplesPerSecond = 21088.1
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.16121338 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0128s; samplesPerSecond = 19546.5
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.15427100 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0125s; samplesPerSecond = 20011.2
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.14844775 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0141s; samplesPerSecond = 17743.1
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.15055713 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0108s; samplesPerSecond = 23067.0
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.15467627 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0132s; samplesPerSecond = 18965.3
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.17615869 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0140s; samplesPerSecond = 17872.5
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.22356104 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0121s; samplesPerSecond = 20650.9
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16514209 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0109s; samplesPerSecond = 22946.3
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17355859 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0129s; samplesPerSecond = 19372.3
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.13117578 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0138s; samplesPerSecond = 18151.5
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.13956104 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0121s; samplesPerSecond = 20743.4
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.18397363 * 250; EvalClassificationError = 0.09600000 * 250; time = 0.0105s; samplesPerSecond = 23741.7
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.15222656 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0126s; samplesPerSecond = 19909.2
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.18856396 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0145s; samplesPerSecond = 17207.0
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.17513330 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0130s; samplesPerSecond = 19199.8
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.15008252 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0108s; samplesPerSecond = 23043.6
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.12125342 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0127s; samplesPerSecond = 19668.0
-05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15408496 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0141s; samplesPerSecond = 17788.5
-05/03/2016 13:02:00: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.19333879 * 10000; EvalClassificationError = 0.07700000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=0.525411s
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.44832977 * 250; EvalErrorPrediction = 0.15200000 * 250; time = 0.0124s; samplesPerSecond = 20205.3
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.40085291 * 250; EvalErrorPrediction = 0.12400000 * 250; time = 0.0142s; samplesPerSecond = 17631.7
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.33487201 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0129s; samplesPerSecond = 19405.4
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.29081885 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0125s; samplesPerSecond = 20016.0
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.26279236 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0118s; samplesPerSecond = 21188.2
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.25220630 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0138s; samplesPerSecond = 18158.0
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.20988293 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0129s; samplesPerSecond = 19447.7
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.21577441 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0148s; samplesPerSecond = 16846.4
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.16622900 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0157s; samplesPerSecond = 15967.3
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.17637866 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0144s; samplesPerSecond = 17315.4
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.22185278 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0123s; samplesPerSecond = 20366.6
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.17055811 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0151s; samplesPerSecond = 16564.0
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.16481055 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0140s; samplesPerSecond = 17910.9
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13871704 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0156s; samplesPerSecond = 16005.1
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.16922363 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0143s; samplesPerSecond = 17454.4
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.15403345 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0135s; samplesPerSecond = 18485.7
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.22255859 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0108s; samplesPerSecond = 23079.8
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18146851 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0133s; samplesPerSecond = 18843.7
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.15611523 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0131s; samplesPerSecond = 19081.1
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.17320215 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0137s; samplesPerSecond = 18192.4
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.15727930 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0117s; samplesPerSecond = 21404.1
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.16195410 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0119s; samplesPerSecond = 21088.1
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.16121338 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0128s; samplesPerSecond = 19546.5
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.15427100 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0125s; samplesPerSecond = 20011.2
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.14844775 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0141s; samplesPerSecond = 17743.1
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.15055713 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0108s; samplesPerSecond = 23067.0
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.15467627 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0132s; samplesPerSecond = 18965.3
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.17615869 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0140s; samplesPerSecond = 17872.5
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.22356104 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0121s; samplesPerSecond = 20650.9
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16514209 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0109s; samplesPerSecond = 22946.3
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17355859 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0129s; samplesPerSecond = 19372.3
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.13117578 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0138s; samplesPerSecond = 18151.5
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.13956104 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0121s; samplesPerSecond = 20743.4
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.18397363 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0105s; samplesPerSecond = 23741.7
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.15222656 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0126s; samplesPerSecond = 19909.2
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.18856396 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0145s; samplesPerSecond = 17207.0
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.17513330 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0130s; samplesPerSecond = 19199.8
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.15008252 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0108s; samplesPerSecond = 23043.6
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.12125342 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0127s; samplesPerSecond = 19668.0
+05/03/2016 13:02:00:  Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15408496 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0141s; samplesPerSecond = 17788.5
+05/03/2016 13:02:00: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.19333879 * 10000; EvalErrorPrediction = 0.07700000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=0.525411s
 05/03/2016 13:02:00: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503140157.802427\CNTKTextFormatReader\Examples\Other\Simple2d_Simple@release_gpu/Models/simple.dnn.2'

 05/03/2016 13:02:00: Starting Epoch 3: learning rate per sample = 0.008000  effective momentum = 0.900000  momentum as time constant = 237.3 samples

 05/03/2016 13:02:00: Starting minibatch loop.
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.10746781 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0126s; samplesPerSecond = 19806.7
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.17648278 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0122s; samplesPerSecond = 20429.8
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.14106094 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0126s; samplesPerSecond = 19838.1
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.16348077 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0127s; samplesPerSecond = 19745.7
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.11767151 * 250; EvalClassificationError = 0.04000000 * 250; time = 0.0110s; samplesPerSecond = 22787.3
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.16217944 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0137s; samplesPerSecond = 18292.2
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.16171204 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0147s; samplesPerSecond = 16977.9
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.19844067 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0130s; samplesPerSecond = 19285.7
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.19984509 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0116s; samplesPerSecond = 21585.2
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.13727051 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0133s; samplesPerSecond = 18839.5
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.20126648 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0150s; samplesPerSecond = 16709.0
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.17913672 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0138s; samplesPerSecond = 18066.2
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.15983582 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0138s; samplesPerSecond = 18131.7
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16260010 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0126s; samplesPerSecond = 19798.8
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.19813428 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0122s; samplesPerSecond = 20453.2
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.10295117 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0124s; samplesPerSecond = 20091.6
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17117065 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0127s; samplesPerSecond = 19762.8
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16661938 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0127s; samplesPerSecond = 19620.2
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.12718042 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0108s; samplesPerSecond = 23156.7
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.11923853 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0139s; samplesPerSecond = 17989.5
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.12890332 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0129s; samplesPerSecond = 19340.9
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.18205469 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0124s; samplesPerSecond = 20182.4
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.13154199 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0111s; samplesPerSecond = 22599.9
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.19668359 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0139s; samplesPerSecond = 17922.4
-05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.15817578 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0126s; samplesPerSecond = 19915.6
-05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.11871240 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0136s; samplesPerSecond = 18378.3
-05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.13730908 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0107s; samplesPerSecond = 23384.2
-05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.20024854 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0134s; samplesPerSecond = 18719.6
-05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.18850244 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0131s; samplesPerSecond = 19151.2
-05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16640479 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0108s; samplesPerSecond = 23086.2
-05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.11872168 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0107s; samplesPerSecond = 23347.0
-05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.16090430 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0127s; samplesPerSecond = 19730.1
-05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.16162939 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0137s; samplesPerSecond = 18205.7
-05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.12408594 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0109s; samplesPerSecond = 22839.4
-05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13544434 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0126s; samplesPerSecond = 19893.4
-05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.20890771 * 250; EvalClassificationError = 0.11200000 * 250; time = 0.0129s; samplesPerSecond = 19366.3
-05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16674365 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0146s; samplesPerSecond = 17116.3
-05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.15033398 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0131s; samplesPerSecond = 19152.7
-05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.16547705 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0120s; samplesPerSecond = 20752.1
-05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.16792480 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0129s; samplesPerSecond = 19450.7
-05/03/2016 13:02:01: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.15806136 * 10000; EvalClassificationError = 0.07470000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=0.511151s
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.10746781 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0126s; samplesPerSecond = 19806.7
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.17648278 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0122s; samplesPerSecond = 20429.8
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.14106094 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0126s; samplesPerSecond = 19838.1
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.16348077 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0127s; samplesPerSecond = 19745.7
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.11767151 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0110s; samplesPerSecond = 22787.3
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.16217944 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0137s; samplesPerSecond = 18292.2
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.16171204 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0147s; samplesPerSecond = 16977.9
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.19844067 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0130s; samplesPerSecond = 19285.7
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.19984509 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0116s; samplesPerSecond = 21585.2
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.13727051 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0133s; samplesPerSecond = 18839.5
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.20126648 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0150s; samplesPerSecond = 16709.0
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.17913672 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0138s; samplesPerSecond = 18066.2
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.15983582 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0138s; samplesPerSecond = 18131.7
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16260010 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0126s; samplesPerSecond = 19798.8
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.19813428 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0122s; samplesPerSecond = 20453.2
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.10295117 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0124s; samplesPerSecond = 20091.6
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17117065 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0127s; samplesPerSecond = 19762.8
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16661938 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0127s; samplesPerSecond = 19620.2
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.12718042 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0108s; samplesPerSecond = 23156.7
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.11923853 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0139s; samplesPerSecond = 17989.5
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.12890332 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0129s; samplesPerSecond = 19340.9
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.18205469 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0124s; samplesPerSecond = 20182.4
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.13154199 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0111s; samplesPerSecond = 22599.9
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.19668359 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0139s; samplesPerSecond = 17922.4
+05/03/2016 13:02:00:  Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.15817578 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0126s; samplesPerSecond = 19915.6
+05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.11871240 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0136s; samplesPerSecond = 18378.3
+05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.13730908 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0107s; samplesPerSecond = 23384.2
+05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.20024854 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0134s; samplesPerSecond = 18719.6
+05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.18850244 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0131s; samplesPerSecond = 19151.2
+05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16640479 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0108s; samplesPerSecond = 23086.2
+05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.11872168 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0107s; samplesPerSecond = 23347.0
+05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.16090430 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0127s; samplesPerSecond = 19730.1
+05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.16162939 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0137s; samplesPerSecond = 18205.7
+05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.12408594 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0109s; samplesPerSecond = 22839.4
+05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13544434 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0126s; samplesPerSecond = 19893.4
+05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.20890771 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0129s; samplesPerSecond = 19366.3
+05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16674365 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0146s; samplesPerSecond = 17116.3
+05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.15033398 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0131s; samplesPerSecond = 19152.7
+05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.16547705 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0120s; samplesPerSecond = 20752.1
+05/03/2016 13:02:01:  Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.16792480 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0129s; samplesPerSecond = 19450.7
+05/03/2016 13:02:01: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.15806136 * 10000; EvalErrorPrediction = 0.07470000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=0.511151s
 05/03/2016 13:02:01: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503140157.802427\CNTKTextFormatReader\Examples\Other\Simple2d_Simple@release_gpu/Models/simple.dnn'
 05/03/2016 13:02:01: CNTKCommandTrainEnd: Simple_Demo_Train

@ -607,7 +607,7 @@ Post-processing network...

 7 roots:
 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
 	InvStdOfFeatures = InvStdDev()
 	MeanOfFeatures = Mean()
 	PosteriorProb = Softmax()
@ -636,7 +636,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *1] -> [2 x 1 x *1]
 Validating --> B2 = LearnableParameter() :  -> [2 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *1], [2 x 1] -> [2 x 1 x *1]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *1] -> [2 x 1 x *1]
 Validating --> Prior = Mean (labels) : [2 x *1] -> [2]
 Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@ -660,7 +660,7 @@ Allocating matrices for forward and/or backward propagation.

 Memory Sharing Structure:

-0000000000000000: {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalClassificationError Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
+0000000000000000: {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalErrorPrediction Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
 000000501A591090: {[W0*features+B0 Value[50 x 1 x *1]] }
 000000501A591130: {[W1*H1 Value[50 x 1 x *1]] }
 000000501A5916D0: {[W1*H1+B1 Value[50 x 1 x *1]] }
@ -672,7 +672,7 @@ Memory Sharing Structure:
 000000501A592850: {[LogOfPrior Value[2]] }
 000000501A5928F0: {[H2 Value[50 x 1 x *1]] }
 000000501A592B70: {[W2 Value[2 x 50]] }
-000000501A592D50: {[EvalClassificationError Value[1]] }
+000000501A592D50: {[EvalErrorPrediction Value[1]] }
 000000501A592DF0: {[CrossEntropyWithSoftmax Value[1]] }
 0000005024E60C70: {[W1 Value[50 x 50]] }
 0000005024E613F0: {[W0 Value[50 x 2]] }
@ -685,7 +685,7 @@ Memory Sharing Structure:
 0000005024E62430: {[features Value[2 x *1]] }
 0000005024E624D0: {[B1 Value[50 x 1]] }

-05/03/2016 13:02:01: Final Results: Minibatch[1-1]: EvalClassificationError = 0.05638474 * 603; CrossEntropyWithSoftmax = 0.12740351 * 603; perplexity = 1.13587526
+05/03/2016 13:02:01: Final Results: Minibatch[1-1]: EvalErrorPrediction = 0.05638474 * 603; CrossEntropyWithSoftmax = 0.12740351 * 603; perplexity = 1.13587526

 05/03/2016 13:02:01: Action "test" complete.

@ -701,7 +701,7 @@ Post-processing network...

 8 roots:
 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
 	InvStdOfFeatures = InvStdDev()
 	MeanOfFeatures = Mean()
 	PosteriorProb = Softmax()
@ -731,7 +731,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *2] -> [2 x 1 x *2]
 Validating --> B2 = LearnableParameter() :  -> [2 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *2], [2 x 1] -> [2 x 1 x *2]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *2] -> [2 x 1 x *2]
 Validating --> Prior = Mean (labels) : [2 x *2] -> [2]
 Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@ -754,7 +754,7 @@ Allocating matrices for forward and/or backward propagation.

 Memory Sharing Structure:

-0000000000000000: {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [CrossEntropyWithSoftmax Value[1]] [EvalClassificationError Gradient[1]] [EvalClassificationError Value[1]] [H1 Gradient[50 x 1 x *2]] [H2 Gradient[50 x 1 x *2]] [HLast Gradient[2 x 1 x *2]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *2]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *2]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *2]] [ScaledLogLikelihood Value[2 x 1 x *2]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *2]] [W0*features+B0 Gradient[50 x 1 x *2]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *2]] [W1*H1+B1 Gradient[50 x 1 x *2]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *2]] [features Gradient[2 x *2]] [labels Gradient[2 x *2]] }
+0000000000000000: {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [CrossEntropyWithSoftmax Value[1]] [EvalErrorPrediction Gradient[1]] [EvalErrorPrediction Value[1]] [H1 Gradient[50 x 1 x *2]] [H2 Gradient[50 x 1 x *2]] [HLast Gradient[2 x 1 x *2]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *2]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *2]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *2]] [ScaledLogLikelihood Value[2 x 1 x *2]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *2]] [W0*features+B0 Gradient[50 x 1 x *2]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *2]] [W1*H1+B1 Gradient[50 x 1 x *2]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *2]] [features Gradient[2 x *2]] [labels Gradient[2 x *2]] }
 000000501A5914F0: {[InvStdOfFeatures Value[2]] }
 000000501A591590: {[MeanOfFeatures Value[2]] }
 000000501A5916D0: {[labels Value[2 x *2]] }
--- a/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/baseline.linux.cpu.txt
+++ b/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/baseline.linux.cpu.txt
@ -0,0 +1,434 @@
+CPU info:
+    CPU Model Name: Intel(R) Xeon(R) CPU E5-2630 v2 @ 2.60GHz
+    Hardware threads: 24
+    Total Memory: 264172964 kB
+-------------------------------------------------------------------
+=== Running /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/1bitsgd/release/bin/cntk configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config/FeedForward.cntk currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu DeviceId=-1 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=2048]] speechTrain=[reader=[useMersenneTwisterRand=true]]
+-------------------------------------------------------------------
+Build info: 
+
+		Built time: Aug 16 2016 09:41:57
+		Last modified date: Mon Aug 15 23:39:17 2016
+		Build type: release
+		Build target: GPU
+		With 1bit-SGD: yes
+		Math lib: mkl
+		CUDA_PATH: /usr/local/cuda-7.5
+		CUB_PATH: /usr/local/cub-1.4.1
+		CUDNN_PATH: /usr/local/cudnn-4.0
+		Build Branch: HEAD
+		Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
+		Built by philly on 643085f7f8c2
+		Build Path: /home/philly/jenkins/workspace/CNTK-Build-Linux
+-------------------------------------------------------------------
+Changed current directory to /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+MPIWrapper: initializing MPI
+ping [requestnodes (before change)]: 1 nodes pinging each other
+ping [requestnodes (before change)]: all 1 nodes responded
+requestnodes [MPIWrapper]: using 1 out of 1 MPI nodes (1 requested); we (0) are in (participating)
+ping [requestnodes (after change)]: 1 nodes pinging each other
+ping [requestnodes (after change)]: all 1 nodes responded
+mpihelper: only one MPI process: MPI operation will be boring
+ping [mpihelper]: 1 nodes pinging each other
+ping [mpihelper]: all 1 nodes responded
+08/16/2016 10:01:41: -------------------------------------------------------------------
+08/16/2016 10:01:41: Build info: 
+
+08/16/2016 10:01:41: 		Built time: Aug 16 2016 09:41:57
+08/16/2016 10:01:41: 		Last modified date: Mon Aug 15 23:39:17 2016
+08/16/2016 10:01:41: 		Build type: release
+08/16/2016 10:01:41: 		Build target: GPU
+08/16/2016 10:01:41: 		With 1bit-SGD: yes
+08/16/2016 10:01:41: 		Math lib: mkl
+08/16/2016 10:01:41: 		CUDA_PATH: /usr/local/cuda-7.5
+08/16/2016 10:01:41: 		CUB_PATH: /usr/local/cub-1.4.1
+08/16/2016 10:01:41: 		CUDNN_PATH: /usr/local/cudnn-4.0
+08/16/2016 10:01:41: 		Build Branch: HEAD
+08/16/2016 10:01:41: 		Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
+08/16/2016 10:01:41: 		Built by philly on 643085f7f8c2
+08/16/2016 10:01:41: 		Build Path: /home/philly/jenkins/workspace/CNTK-Build-Linux
+08/16/2016 10:01:41: -------------------------------------------------------------------
+08/16/2016 10:01:42: -------------------------------------------------------------------
+08/16/2016 10:01:42: GPU info:
+
+08/16/2016 10:01:42: 		Device[0]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:01:42: 		Device[1]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:01:42: 		Device[2]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:01:42: 		Device[3]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:01:42: -------------------------------------------------------------------
+
+08/16/2016 10:01:42: Running on localhost at 2016/08/16 10:01:42
+08/16/2016 10:01:42: Command line: 
+/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/1bitsgd/release/bin/cntk  configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config/FeedForward.cntk  currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data  RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu  DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data  ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config  OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu  DeviceId=-1  timestamping=true  speechTrain=[SGD=[maxEpochs=1]]  speechTrain=[SGD=[epochSize=2048]]  speechTrain=[reader=[useMersenneTwisterRand=true]]
+
+
+
+08/16/2016 10:01:42: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
+08/16/2016 10:01:42: RootDir = ".."
+ConfigDir = "$RootDir$/Config"
+DataDir = "$RootDir$/Data"
+OutputDir = "$RootDir$/Output"
+ModelDir = "$OutputDir$/Models"
+deviceId = -1
+command = speechTrain
+precision = "float"
+traceLevel = "1"
+modelPath = "$ModelDir$/cntkSpeechFF.dnn"
+parallelTrain = true
+speechTrain = [
+    action = "train"
+    SimpleNetworkBuilder = [
+        layerSizes = 363:512:512:132
+        trainingCriterion = "CrossEntropyWithSoftmax"
+        evalCriterion = "ErrorPrediction"
+        layerTypes = "Sigmoid"
+        applyMeanVarNorm = true
+        needPrior = true
+    ]
+    SGD = [
+        epochSize = 20480
+        minibatchSize = 256:1024:2048
+        learningRatesPerMB = 1.0:0.5:0.1
+        numMBsToShowResult = 10
+        momentumPerMB = 0.9:0.656119
+        maxEpochs = 3
+        keepCheckPointFiles = true
+        parallelTrain = [
+            parallelizationMethod = "DataParallelSGD"
+            distributedMBReading = true
+            dataParallelSGD = [
+                gradientBits = 1
+            ]
+        ]
+        autoAdjust=[
+            autoAdjustMinibatch = true
+            minibatchSizeTuningFrequency = 1
+            minibatchSearchCriterionErrorMargin = 2
+        ]
+    ]
+    reader = [
+        readerType = "HTKMLFReader"
+        readMethod = "blockRandomize"
+        miniBatchMode = "partial"
+        randomize = "auto"
+        verbosity = 0
+        features = [ 
+            dim = 363
+            type = "real"
+            scpFile = "$DataDir$/glob_0000.scp"
+        ]
+        labels = [
+            mlfFile = "$DataDir$/glob_0000.mlf"
+            labelMappingFile = "$DataDir$/state.list"
+            labelDim = 132
+            labelType = "category"
+        ]
+    ]
+]
+currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu
+DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config
+OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu
+DeviceId=-1
+timestamping=true
+speechTrain=[SGD=[maxEpochs=1]]
+speechTrain=[SGD=[epochSize=2048]]
+speechTrain=[reader=[useMersenneTwisterRand=true]]
+
+08/16/2016 10:01:42: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
+
+08/16/2016 10:01:42: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+08/16/2016 10:01:42: RootDir = ".."
+ConfigDir = "../Config"
+DataDir = "../Data"
+OutputDir = "../Output"
+ModelDir = "/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu/Models"
+deviceId = -1
+command = speechTrain
+precision = "float"
+traceLevel = "1"
+modelPath = "/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn"
+parallelTrain = true
+speechTrain = [
+    action = "train"
+    SimpleNetworkBuilder = [
+        layerSizes = 363:512:512:132
+        trainingCriterion = "CrossEntropyWithSoftmax"
+        evalCriterion = "ErrorPrediction"
+        layerTypes = "Sigmoid"
+        applyMeanVarNorm = true
+        needPrior = true
+    ]
+    SGD = [
+        epochSize = 20480
+        minibatchSize = 256:1024:2048
+        learningRatesPerMB = 1.0:0.5:0.1
+        numMBsToShowResult = 10
+        momentumPerMB = 0.9:0.656119
+        maxEpochs = 3
+        keepCheckPointFiles = true
+        parallelTrain = [
+            parallelizationMethod = "DataParallelSGD"
+            distributedMBReading = true
+            dataParallelSGD = [
+                gradientBits = 1
+            ]
+        ]
+        autoAdjust=[
+            autoAdjustMinibatch = true
+            minibatchSizeTuningFrequency = 1
+            minibatchSearchCriterionErrorMargin = 2
+        ]
+    ]
+    reader = [
+        readerType = "HTKMLFReader"
+        readMethod = "blockRandomize"
+        miniBatchMode = "partial"
+        randomize = "auto"
+        verbosity = 0
+        features = [ 
+            dim = 363
+            type = "real"
+            scpFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp"
+        ]
+        labels = [
+            mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf"
+            labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list"
+            labelDim = 132
+            labelType = "category"
+        ]
+    ]
+]
+currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu
+DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config
+OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu
+DeviceId=-1
+timestamping=true
+speechTrain=[SGD=[maxEpochs=1]]
+speechTrain=[SGD=[epochSize=2048]]
+speechTrain=[reader=[useMersenneTwisterRand=true]]
+
+08/16/2016 10:01:42: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+
+08/16/2016 10:01:42: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+configparameters: FeedForward.cntk:command=speechTrain
+configparameters: FeedForward.cntk:ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config
+configparameters: FeedForward.cntk:currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+configparameters: FeedForward.cntk:DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+configparameters: FeedForward.cntk:deviceId=-1
+configparameters: FeedForward.cntk:ModelDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu/Models
+configparameters: FeedForward.cntk:modelPath=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn
+configparameters: FeedForward.cntk:OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu
+configparameters: FeedForward.cntk:parallelTrain=true
+configparameters: FeedForward.cntk:precision=float
+configparameters: FeedForward.cntk:RootDir=..
+configparameters: FeedForward.cntk:RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu
+configparameters: FeedForward.cntk:speechTrain=[
+    action = "train"
+    SimpleNetworkBuilder = [
+        layerSizes = 363:512:512:132
+        trainingCriterion = "CrossEntropyWithSoftmax"
+        evalCriterion = "ErrorPrediction"
+        layerTypes = "Sigmoid"
+        applyMeanVarNorm = true
+        needPrior = true
+    ]
+    SGD = [
+        epochSize = 20480
+        minibatchSize = 256:1024:2048
+        learningRatesPerMB = 1.0:0.5:0.1
+        numMBsToShowResult = 10
+        momentumPerMB = 0.9:0.656119
+        maxEpochs = 3
+        keepCheckPointFiles = true
+        parallelTrain = [
+            parallelizationMethod = "DataParallelSGD"
+            distributedMBReading = true
+            dataParallelSGD = [
+                gradientBits = 1
+            ]
+        ]
+        autoAdjust=[
+            autoAdjustMinibatch = true
+            minibatchSizeTuningFrequency = 1
+            minibatchSearchCriterionErrorMargin = 2
+        ]
+    ]
+    reader = [
+        readerType = "HTKMLFReader"
+        readMethod = "blockRandomize"
+        miniBatchMode = "partial"
+        randomize = "auto"
+        verbosity = 0
+        features = [ 
+            dim = 363
+            type = "real"
+            scpFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp"
+        ]
+        labels = [
+            mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf"
+            labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list"
+            labelDim = 132
+            labelType = "category"
+        ]
+    ]
+] [SGD=[maxEpochs=1]] [SGD=[epochSize=2048]] [reader=[useMersenneTwisterRand=true]]
+
+configparameters: FeedForward.cntk:timestamping=true
+configparameters: FeedForward.cntk:traceLevel=1
+08/16/2016 10:01:42: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+08/16/2016 10:01:42: Commands: speechTrain
+08/16/2016 10:01:42: Precision = "float"
+08/16/2016 10:01:42: CNTKModelPath: /tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn
+08/16/2016 10:01:42: CNTKCommandTrainInfo: speechTrain : 1
+08/16/2016 10:01:42: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 1
+
+08/16/2016 10:01:42: ##############################################################################
+08/16/2016 10:01:42: #                                                                            #
+08/16/2016 10:01:42: # Action "train"                                                             #
+08/16/2016 10:01:42: #                                                                            #
+08/16/2016 10:01:42: ##############################################################################
+
+08/16/2016 10:01:42: CNTKCommandTrainBegin: speechTrain
+SimpleNetworkBuilder Using CPU
+reading script file /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp ... 948 entries
+total 132 state names in state list /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list
+htkmlfreader: reading MLF file /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf ... total 948 entries
+...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
+label set 0: 129 classes
+minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
+
+08/16/2016 10:01:42: Creating virgin network.
+Node 'W0' (LearnableParameter operation): Initializing Parameter[512 x 363] <- 0.000000.
+Node 'W0' (LearnableParameter operation): Initializing Parameter[512 x 363] <- uniform(seed=1, range=0.050000*1.000000, onCPU=false).
+Node 'B0' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
+Node 'B0' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
+Node 'W1' (LearnableParameter operation): Initializing Parameter[512 x 512] <- 0.000000.
+Node 'W1' (LearnableParameter operation): Initializing Parameter[512 x 512] <- uniform(seed=2, range=0.050000*1.000000, onCPU=false).
+Node 'B1' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
+Node 'B1' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
+Node 'W2' (LearnableParameter operation): Initializing Parameter[132 x 512] <- 0.000000.
+Node 'W2' (LearnableParameter operation): Initializing Parameter[132 x 512] <- uniform(seed=3, range=0.050000*1.000000, onCPU=false).
+Node 'B2' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
+Node 'B2' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
+
+Post-processing network...
+
+7 roots:
+	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
+	EvalErrorPrediction = ErrorPrediction()
+	InvStdOfFeatures = InvStdDev()
+	MeanOfFeatures = Mean()
+	PosteriorProb = Softmax()
+	Prior = Mean()
+	ScaledLogLikelihood = Minus()
+
+Validating network. 25 nodes to process in pass 1.
+
+Validating --> labels = InputValue() :  -> [132 x *]
+Validating --> W2 = LearnableParameter() :  -> [132 x 512]
+Validating --> W1 = LearnableParameter() :  -> [512 x 512]
+Validating --> W0 = LearnableParameter() :  -> [512 x 363]
+Validating --> features = InputValue() :  -> [363 x *]
+Validating --> MeanOfFeatures = Mean (features) : [363 x *] -> [363]
+Validating --> InvStdOfFeatures = InvStdDev (features) : [363 x *] -> [363]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization (features, MeanOfFeatures, InvStdOfFeatures) : [363 x *], [363], [363] -> [363 x *]
+Validating --> W0*features = Times (W0, MVNormalizedFeatures) : [512 x 363], [363 x *] -> [512 x *]
+Validating --> B0 = LearnableParameter() :  -> [512 x 1]
+Validating --> W0*features+B0 = Plus (W0*features, B0) : [512 x *], [512 x 1] -> [512 x 1 x *]
+Validating --> H1 = Sigmoid (W0*features+B0) : [512 x 1 x *] -> [512 x 1 x *]
+Validating --> W1*H1 = Times (W1, H1) : [512 x 512], [512 x 1 x *] -> [512 x 1 x *]
+Validating --> B1 = LearnableParameter() :  -> [512 x 1]
+Validating --> W1*H1+B1 = Plus (W1*H1, B1) : [512 x 1 x *], [512 x 1] -> [512 x 1 x *]
+Validating --> H2 = Sigmoid (W1*H1+B1) : [512 x 1 x *] -> [512 x 1 x *]
+Validating --> W2*H1 = Times (W2, H2) : [132 x 512], [512 x 1 x *] -> [132 x 1 x *]
+Validating --> B2 = LearnableParameter() :  -> [132 x 1]
+Validating --> HLast = Plus (W2*H1, B2) : [132 x 1 x *], [132 x 1] -> [132 x 1 x *]
+Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [132 x *], [132 x 1 x *] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [132 x *], [132 x 1 x *] -> [1]
+Validating --> PosteriorProb = Softmax (HLast) : [132 x 1 x *] -> [132 x 1 x *]
+Validating --> Prior = Mean (labels) : [132 x *] -> [132]
+Validating --> LogOfPrior = Log (Prior) : [132] -> [132]
+Validating --> ScaledLogLikelihood = Minus (HLast, LogOfPrior) : [132 x 1 x *], [132] -> [132 x 1 x *]
+
+Validating network. 17 nodes to process in pass 2.
+
+
+Validating network, final pass.
+
+
+
+12 out of 25 nodes do not share the minibatch layout with the input data.
+
+Post-processing network complete.
+
+08/16/2016 10:01:42: Created model with 25 nodes on CPU.
+
+08/16/2016 10:01:42: Training criterion node(s):
+08/16/2016 10:01:42: 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax
+
+08/16/2016 10:01:42: Evaluation criterion node(s):
+08/16/2016 10:01:42: 	EvalErrorPrediction = ErrorPrediction
+
+
+Allocating matrices for forward and/or backward propagation.
+
+Memory Sharing: Out of 40 matrices, 19 are shared as 8, and 21 are not shared.
+
+	{ W1 : [512 x 512] (gradient)
+	  W1*H1+B1 : [512 x 1 x *] }
+	{ H2 : [512 x 1 x *]
+	  W1*H1 : [512 x 1 x *] (gradient) }
+	{ B0 : [512 x 1] (gradient)
+	  H1 : [512 x 1 x *] (gradient)
+	  W1*H1+B1 : [512 x 1 x *] (gradient)
+	  W2*H1 : [132 x 1 x *] }
+	{ HLast : [132 x 1 x *]
+	  W2 : [132 x 512] (gradient) }
+	{ B1 : [512 x 1] (gradient)
+	  H2 : [512 x 1 x *] (gradient)
+	  HLast : [132 x 1 x *] (gradient) }
+	{ W0 : [512 x 363] (gradient)
+	  W0*features+B0 : [512 x 1 x *] }
+	{ H1 : [512 x 1 x *]
+	  W0*features : [512 x *] (gradient) }
+	{ W0*features+B0 : [512 x 1 x *] (gradient)
+	  W1*H1 : [512 x 1 x *] }
+
+
+08/16/2016 10:01:42: Training 516740 parameters in 6 out of 6 parameter tensors and 15 nodes with gradient:
+
+08/16/2016 10:01:42: 	Node 'B0' (LearnableParameter operation) : [512 x 1]
+08/16/2016 10:01:42: 	Node 'B1' (LearnableParameter operation) : [512 x 1]
+08/16/2016 10:01:42: 	Node 'B2' (LearnableParameter operation) : [132 x 1]
+08/16/2016 10:01:42: 	Node 'W0' (LearnableParameter operation) : [512 x 363]
+08/16/2016 10:01:42: 	Node 'W1' (LearnableParameter operation) : [512 x 512]
+08/16/2016 10:01:42: 	Node 'W2' (LearnableParameter operation) : [132 x 512]
+
+
+08/16/2016 10:01:42: Precomputing --> 3 PreCompute nodes found.
+
+08/16/2016 10:01:42: 	MeanOfFeatures = Mean()
+08/16/2016 10:01:42: 	InvStdOfFeatures = InvStdDev()
+08/16/2016 10:01:42: 	Prior = Mean()
+minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
+requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
+
+08/16/2016 10:01:43: Precomputing --> Completed.
+
+
+08/16/2016 10:01:43: Starting Epoch 1: learning rate per sample = 0.003906  effective momentum = 0.900000  momentum as time constant = 2429.8 samples
+minibatchiterator: epoch 0: frames [0..2048] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
+
+08/16/2016 10:01:43: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1), distributed reading is ENABLED.
+08/16/2016 10:01:44: Finished Epoch[ 1 of 1]: [Training] CrossEntropyWithSoftmax = 4.45117986 * 2048; EvalErrorPrediction = 0.92187500 * 2048; totalSamplesSeen = 2048; learningRatePerSample = 0.00390625; epochTime=0.209966s
+08/16/2016 10:01:44: SGD: Saving checkpoint model '/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn'
+08/16/2016 10:01:44: CNTKCommandTrainEnd: speechTrain
+
+08/16/2016 10:01:44: Action "train" complete.
+
+08/16/2016 10:01:44: __COMPLETED__
+~MPIWrapper
--- a/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/baseline.linux.debug.cpu.txt
+++ b/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/baseline.linux.debug.cpu.txt
@ -1 +0,0 @@
-__COMPLETED__
--- a/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/baseline.linux.debug.gpu.txt
+++ b/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/baseline.linux.debug.gpu.txt
@ -1 +0,0 @@
-__COMPLETED__
--- a/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/baseline.linux.gpu.txt
+++ b/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/baseline.linux.gpu.txt
@ -0,0 +1,435 @@
+CPU info:
+    CPU Model Name: Intel(R) Xeon(R) CPU E5-2630 v2 @ 2.60GHz
+    Hardware threads: 24
+    Total Memory: 264172964 kB
+-------------------------------------------------------------------
+=== Running /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/1bitsgd/release/bin/cntk configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config/FeedForward.cntk currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu DeviceId=0 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=2048]] speechTrain=[reader=[useMersenneTwisterRand=true]]
+-------------------------------------------------------------------
+Build info: 
+
+		Built time: Aug 16 2016 09:41:57
+		Last modified date: Mon Aug 15 23:39:17 2016
+		Build type: release
+		Build target: GPU
+		With 1bit-SGD: yes
+		Math lib: mkl
+		CUDA_PATH: /usr/local/cuda-7.5
+		CUB_PATH: /usr/local/cub-1.4.1
+		CUDNN_PATH: /usr/local/cudnn-4.0
+		Build Branch: HEAD
+		Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
+		Built by philly on 643085f7f8c2
+		Build Path: /home/philly/jenkins/workspace/CNTK-Build-Linux
+-------------------------------------------------------------------
+Changed current directory to /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+MPIWrapper: initializing MPI
+ping [requestnodes (before change)]: 1 nodes pinging each other
+ping [requestnodes (before change)]: all 1 nodes responded
+requestnodes [MPIWrapper]: using 1 out of 1 MPI nodes (1 requested); we (0) are in (participating)
+ping [requestnodes (after change)]: 1 nodes pinging each other
+ping [requestnodes (after change)]: all 1 nodes responded
+mpihelper: only one MPI process: MPI operation will be boring
+ping [mpihelper]: 1 nodes pinging each other
+ping [mpihelper]: all 1 nodes responded
+08/16/2016 10:01:45: -------------------------------------------------------------------
+08/16/2016 10:01:45: Build info: 
+
+08/16/2016 10:01:45: 		Built time: Aug 16 2016 09:41:57
+08/16/2016 10:01:45: 		Last modified date: Mon Aug 15 23:39:17 2016
+08/16/2016 10:01:45: 		Build type: release
+08/16/2016 10:01:45: 		Build target: GPU
+08/16/2016 10:01:45: 		With 1bit-SGD: yes
+08/16/2016 10:01:45: 		Math lib: mkl
+08/16/2016 10:01:45: 		CUDA_PATH: /usr/local/cuda-7.5
+08/16/2016 10:01:45: 		CUB_PATH: /usr/local/cub-1.4.1
+08/16/2016 10:01:45: 		CUDNN_PATH: /usr/local/cudnn-4.0
+08/16/2016 10:01:45: 		Build Branch: HEAD
+08/16/2016 10:01:45: 		Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
+08/16/2016 10:01:45: 		Built by philly on 643085f7f8c2
+08/16/2016 10:01:45: 		Build Path: /home/philly/jenkins/workspace/CNTK-Build-Linux
+08/16/2016 10:01:45: -------------------------------------------------------------------
+08/16/2016 10:01:46: -------------------------------------------------------------------
+08/16/2016 10:01:46: GPU info:
+
+08/16/2016 10:01:46: 		Device[0]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:01:46: 		Device[1]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:01:46: 		Device[2]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:01:46: 		Device[3]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:01:46: -------------------------------------------------------------------
+
+08/16/2016 10:01:46: Running on localhost at 2016/08/16 10:01:46
+08/16/2016 10:01:46: Command line: 
+/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/1bitsgd/release/bin/cntk  configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config/FeedForward.cntk  currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data  RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu  DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data  ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config  OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu  DeviceId=0  timestamping=true  speechTrain=[SGD=[maxEpochs=1]]  speechTrain=[SGD=[epochSize=2048]]  speechTrain=[reader=[useMersenneTwisterRand=true]]
+
+
+
+08/16/2016 10:01:46: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
+08/16/2016 10:01:46: RootDir = ".."
+ConfigDir = "$RootDir$/Config"
+DataDir = "$RootDir$/Data"
+OutputDir = "$RootDir$/Output"
+ModelDir = "$OutputDir$/Models"
+deviceId = -1
+command = speechTrain
+precision = "float"
+traceLevel = "1"
+modelPath = "$ModelDir$/cntkSpeechFF.dnn"
+parallelTrain = true
+speechTrain = [
+    action = "train"
+    SimpleNetworkBuilder = [
+        layerSizes = 363:512:512:132
+        trainingCriterion = "CrossEntropyWithSoftmax"
+        evalCriterion = "ErrorPrediction"
+        layerTypes = "Sigmoid"
+        applyMeanVarNorm = true
+        needPrior = true
+    ]
+    SGD = [
+        epochSize = 20480
+        minibatchSize = 256:1024:2048
+        learningRatesPerMB = 1.0:0.5:0.1
+        numMBsToShowResult = 10
+        momentumPerMB = 0.9:0.656119
+        maxEpochs = 3
+        keepCheckPointFiles = true
+        parallelTrain = [
+            parallelizationMethod = "DataParallelSGD"
+            distributedMBReading = true
+            dataParallelSGD = [
+                gradientBits = 1
+            ]
+        ]
+        autoAdjust=[
+            autoAdjustMinibatch = true
+            minibatchSizeTuningFrequency = 1
+            minibatchSearchCriterionErrorMargin = 2
+        ]
+    ]
+    reader = [
+        readerType = "HTKMLFReader"
+        readMethod = "blockRandomize"
+        miniBatchMode = "partial"
+        randomize = "auto"
+        verbosity = 0
+        features = [ 
+            dim = 363
+            type = "real"
+            scpFile = "$DataDir$/glob_0000.scp"
+        ]
+        labels = [
+            mlfFile = "$DataDir$/glob_0000.mlf"
+            labelMappingFile = "$DataDir$/state.list"
+            labelDim = 132
+            labelType = "category"
+        ]
+    ]
+]
+currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu
+DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config
+OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu
+DeviceId=0
+timestamping=true
+speechTrain=[SGD=[maxEpochs=1]]
+speechTrain=[SGD=[epochSize=2048]]
+speechTrain=[reader=[useMersenneTwisterRand=true]]
+
+08/16/2016 10:01:46: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
+
+08/16/2016 10:01:46: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+08/16/2016 10:01:46: RootDir = ".."
+ConfigDir = "../Config"
+DataDir = "../Data"
+OutputDir = "../Output"
+ModelDir = "/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu/Models"
+deviceId = -1
+command = speechTrain
+precision = "float"
+traceLevel = "1"
+modelPath = "/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn"
+parallelTrain = true
+speechTrain = [
+    action = "train"
+    SimpleNetworkBuilder = [
+        layerSizes = 363:512:512:132
+        trainingCriterion = "CrossEntropyWithSoftmax"
+        evalCriterion = "ErrorPrediction"
+        layerTypes = "Sigmoid"
+        applyMeanVarNorm = true
+        needPrior = true
+    ]
+    SGD = [
+        epochSize = 20480
+        minibatchSize = 256:1024:2048
+        learningRatesPerMB = 1.0:0.5:0.1
+        numMBsToShowResult = 10
+        momentumPerMB = 0.9:0.656119
+        maxEpochs = 3
+        keepCheckPointFiles = true
+        parallelTrain = [
+            parallelizationMethod = "DataParallelSGD"
+            distributedMBReading = true
+            dataParallelSGD = [
+                gradientBits = 1
+            ]
+        ]
+        autoAdjust=[
+            autoAdjustMinibatch = true
+            minibatchSizeTuningFrequency = 1
+            minibatchSearchCriterionErrorMargin = 2
+        ]
+    ]
+    reader = [
+        readerType = "HTKMLFReader"
+        readMethod = "blockRandomize"
+        miniBatchMode = "partial"
+        randomize = "auto"
+        verbosity = 0
+        features = [ 
+            dim = 363
+            type = "real"
+            scpFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp"
+        ]
+        labels = [
+            mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf"
+            labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list"
+            labelDim = 132
+            labelType = "category"
+        ]
+    ]
+]
+currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu
+DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config
+OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu
+DeviceId=0
+timestamping=true
+speechTrain=[SGD=[maxEpochs=1]]
+speechTrain=[SGD=[epochSize=2048]]
+speechTrain=[reader=[useMersenneTwisterRand=true]]
+
+08/16/2016 10:01:46: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+
+08/16/2016 10:01:46: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+configparameters: FeedForward.cntk:command=speechTrain
+configparameters: FeedForward.cntk:ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config
+configparameters: FeedForward.cntk:currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+configparameters: FeedForward.cntk:DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+configparameters: FeedForward.cntk:deviceId=0
+configparameters: FeedForward.cntk:ModelDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu/Models
+configparameters: FeedForward.cntk:modelPath=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn
+configparameters: FeedForward.cntk:OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu
+configparameters: FeedForward.cntk:parallelTrain=true
+configparameters: FeedForward.cntk:precision=float
+configparameters: FeedForward.cntk:RootDir=..
+configparameters: FeedForward.cntk:RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu
+configparameters: FeedForward.cntk:speechTrain=[
+    action = "train"
+    SimpleNetworkBuilder = [
+        layerSizes = 363:512:512:132
+        trainingCriterion = "CrossEntropyWithSoftmax"
+        evalCriterion = "ErrorPrediction"
+        layerTypes = "Sigmoid"
+        applyMeanVarNorm = true
+        needPrior = true
+    ]
+    SGD = [
+        epochSize = 20480
+        minibatchSize = 256:1024:2048
+        learningRatesPerMB = 1.0:0.5:0.1
+        numMBsToShowResult = 10
+        momentumPerMB = 0.9:0.656119
+        maxEpochs = 3
+        keepCheckPointFiles = true
+        parallelTrain = [
+            parallelizationMethod = "DataParallelSGD"
+            distributedMBReading = true
+            dataParallelSGD = [
+                gradientBits = 1
+            ]
+        ]
+        autoAdjust=[
+            autoAdjustMinibatch = true
+            minibatchSizeTuningFrequency = 1
+            minibatchSearchCriterionErrorMargin = 2
+        ]
+    ]
+    reader = [
+        readerType = "HTKMLFReader"
+        readMethod = "blockRandomize"
+        miniBatchMode = "partial"
+        randomize = "auto"
+        verbosity = 0
+        features = [ 
+            dim = 363
+            type = "real"
+            scpFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp"
+        ]
+        labels = [
+            mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf"
+            labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list"
+            labelDim = 132
+            labelType = "category"
+        ]
+    ]
+] [SGD=[maxEpochs=1]] [SGD=[epochSize=2048]] [reader=[useMersenneTwisterRand=true]]
+
+configparameters: FeedForward.cntk:timestamping=true
+configparameters: FeedForward.cntk:traceLevel=1
+08/16/2016 10:01:46: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+08/16/2016 10:01:46: Commands: speechTrain
+08/16/2016 10:01:46: Precision = "float"
+08/16/2016 10:01:46: CNTKModelPath: /tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn
+08/16/2016 10:01:46: CNTKCommandTrainInfo: speechTrain : 1
+08/16/2016 10:01:46: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 1
+
+08/16/2016 10:01:46: ##############################################################################
+08/16/2016 10:01:46: #                                                                            #
+08/16/2016 10:01:46: # Action "train"                                                             #
+08/16/2016 10:01:46: #                                                                            #
+08/16/2016 10:01:46: ##############################################################################
+
+08/16/2016 10:01:46: CNTKCommandTrainBegin: speechTrain
+SimpleNetworkBuilder Using GPU 0
+reading script file /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp ... 948 entries
+total 132 state names in state list /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list
+htkmlfreader: reading MLF file /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf ... total 948 entries
+...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
+label set 0: 129 classes
+minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
+
+08/16/2016 10:01:46: Creating virgin network.
+Node 'W0' (LearnableParameter operation): Initializing Parameter[512 x 363] <- 0.000000.
+Node 'W0' (LearnableParameter operation): Initializing Parameter[512 x 363] <- uniform(seed=1, range=0.050000*1.000000, onCPU=false).
+SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4
+Node 'B0' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
+Node 'B0' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
+Node 'W1' (LearnableParameter operation): Initializing Parameter[512 x 512] <- 0.000000.
+Node 'W1' (LearnableParameter operation): Initializing Parameter[512 x 512] <- uniform(seed=2, range=0.050000*1.000000, onCPU=false).
+Node 'B1' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
+Node 'B1' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
+Node 'W2' (LearnableParameter operation): Initializing Parameter[132 x 512] <- 0.000000.
+Node 'W2' (LearnableParameter operation): Initializing Parameter[132 x 512] <- uniform(seed=3, range=0.050000*1.000000, onCPU=false).
+Node 'B2' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
+Node 'B2' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
+
+Post-processing network...
+
+7 roots:
+	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
+	EvalErrorPrediction = ErrorPrediction()
+	InvStdOfFeatures = InvStdDev()
+	MeanOfFeatures = Mean()
+	PosteriorProb = Softmax()
+	Prior = Mean()
+	ScaledLogLikelihood = Minus()
+
+Validating network. 25 nodes to process in pass 1.
+
+Validating --> labels = InputValue() :  -> [132 x *]
+Validating --> W2 = LearnableParameter() :  -> [132 x 512]
+Validating --> W1 = LearnableParameter() :  -> [512 x 512]
+Validating --> W0 = LearnableParameter() :  -> [512 x 363]
+Validating --> features = InputValue() :  -> [363 x *]
+Validating --> MeanOfFeatures = Mean (features) : [363 x *] -> [363]
+Validating --> InvStdOfFeatures = InvStdDev (features) : [363 x *] -> [363]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization (features, MeanOfFeatures, InvStdOfFeatures) : [363 x *], [363], [363] -> [363 x *]
+Validating --> W0*features = Times (W0, MVNormalizedFeatures) : [512 x 363], [363 x *] -> [512 x *]
+Validating --> B0 = LearnableParameter() :  -> [512 x 1]
+Validating --> W0*features+B0 = Plus (W0*features, B0) : [512 x *], [512 x 1] -> [512 x 1 x *]
+Validating --> H1 = Sigmoid (W0*features+B0) : [512 x 1 x *] -> [512 x 1 x *]
+Validating --> W1*H1 = Times (W1, H1) : [512 x 512], [512 x 1 x *] -> [512 x 1 x *]
+Validating --> B1 = LearnableParameter() :  -> [512 x 1]
+Validating --> W1*H1+B1 = Plus (W1*H1, B1) : [512 x 1 x *], [512 x 1] -> [512 x 1 x *]
+Validating --> H2 = Sigmoid (W1*H1+B1) : [512 x 1 x *] -> [512 x 1 x *]
+Validating --> W2*H1 = Times (W2, H2) : [132 x 512], [512 x 1 x *] -> [132 x 1 x *]
+Validating --> B2 = LearnableParameter() :  -> [132 x 1]
+Validating --> HLast = Plus (W2*H1, B2) : [132 x 1 x *], [132 x 1] -> [132 x 1 x *]
+Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [132 x *], [132 x 1 x *] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [132 x *], [132 x 1 x *] -> [1]
+Validating --> PosteriorProb = Softmax (HLast) : [132 x 1 x *] -> [132 x 1 x *]
+Validating --> Prior = Mean (labels) : [132 x *] -> [132]
+Validating --> LogOfPrior = Log (Prior) : [132] -> [132]
+Validating --> ScaledLogLikelihood = Minus (HLast, LogOfPrior) : [132 x 1 x *], [132] -> [132 x 1 x *]
+
+Validating network. 17 nodes to process in pass 2.
+
+
+Validating network, final pass.
+
+
+
+12 out of 25 nodes do not share the minibatch layout with the input data.
+
+Post-processing network complete.
+
+08/16/2016 10:01:46: Created model with 25 nodes on GPU 0.
+
+08/16/2016 10:01:46: Training criterion node(s):
+08/16/2016 10:01:46: 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax
+
+08/16/2016 10:01:46: Evaluation criterion node(s):
+08/16/2016 10:01:46: 	EvalErrorPrediction = ErrorPrediction
+
+
+Allocating matrices for forward and/or backward propagation.
+
+Memory Sharing: Out of 40 matrices, 19 are shared as 8, and 21 are not shared.
+
+	{ W0 : [512 x 363] (gradient)
+	  W0*features+B0 : [512 x 1 x *] }
+	{ H1 : [512 x 1 x *]
+	  W0*features : [512 x *] (gradient) }
+	{ W0*features+B0 : [512 x 1 x *] (gradient)
+	  W1*H1 : [512 x 1 x *] }
+	{ W1 : [512 x 512] (gradient)
+	  W1*H1+B1 : [512 x 1 x *] }
+	{ H2 : [512 x 1 x *]
+	  W1*H1 : [512 x 1 x *] (gradient) }
+	{ B0 : [512 x 1] (gradient)
+	  H1 : [512 x 1 x *] (gradient)
+	  W1*H1+B1 : [512 x 1 x *] (gradient)
+	  W2*H1 : [132 x 1 x *] }
+	{ HLast : [132 x 1 x *]
+	  W2 : [132 x 512] (gradient) }
+	{ B1 : [512 x 1] (gradient)
+	  H2 : [512 x 1 x *] (gradient)
+	  HLast : [132 x 1 x *] (gradient) }
+
+
+08/16/2016 10:01:46: Training 516740 parameters in 6 out of 6 parameter tensors and 15 nodes with gradient:
+
+08/16/2016 10:01:46: 	Node 'B0' (LearnableParameter operation) : [512 x 1]
+08/16/2016 10:01:46: 	Node 'B1' (LearnableParameter operation) : [512 x 1]
+08/16/2016 10:01:46: 	Node 'B2' (LearnableParameter operation) : [132 x 1]
+08/16/2016 10:01:46: 	Node 'W0' (LearnableParameter operation) : [512 x 363]
+08/16/2016 10:01:46: 	Node 'W1' (LearnableParameter operation) : [512 x 512]
+08/16/2016 10:01:46: 	Node 'W2' (LearnableParameter operation) : [132 x 512]
+
+
+08/16/2016 10:01:46: Precomputing --> 3 PreCompute nodes found.
+
+08/16/2016 10:01:46: 	MeanOfFeatures = Mean()
+08/16/2016 10:01:46: 	InvStdOfFeatures = InvStdDev()
+08/16/2016 10:01:46: 	Prior = Mean()
+minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
+requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
+
+08/16/2016 10:01:46: Precomputing --> Completed.
+
+
+08/16/2016 10:01:46: Starting Epoch 1: learning rate per sample = 0.003906  effective momentum = 0.900000  momentum as time constant = 2429.8 samples
+minibatchiterator: epoch 0: frames [0..2048] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
+
+08/16/2016 10:01:46: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1), distributed reading is ENABLED.
+08/16/2016 10:01:46: Finished Epoch[ 1 of 1]: [Training] CrossEntropyWithSoftmax = 4.41144794 * 2048; EvalErrorPrediction = 0.92773438 * 2048; totalSamplesSeen = 2048; learningRatePerSample = 0.00390625; epochTime=0.023072s
+08/16/2016 10:01:46: SGD: Saving checkpoint model '/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn'
+08/16/2016 10:01:46: CNTKCommandTrainEnd: speechTrain
+
+08/16/2016 10:01:46: Action "train" complete.
+
+08/16/2016 10:01:46: __COMPLETED__
+~MPIWrapper
--- a/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/baseline.linux.release.cpu.txt
+++ b/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/baseline.linux.release.cpu.txt
@ -1 +0,0 @@
-__COMPLETED__
--- a/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/baseline.linux.release.gpu.txt
+++ b/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/baseline.linux.release.gpu.txt
@ -1 +0,0 @@
-__COMPLETED__
--- a/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/baseline.windows.release.cpu.txt
+++ b/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/baseline.windows.release.cpu.txt
@ -1,18 +1,24 @@
-=== Running /cygdrive/c/jenkins/workspace/CNTK-Test-Windows-W1/x64/release/cntk.exe configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/FeedForward.cntk currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu DeviceId=-1 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=2048]]
+CPU info:
+    CPU Model Name: Intel(R) Xeon(R) CPU E5-2630 v2 @ 2.60GHz
+    Hardware threads: 24
+    Total Memory: 268381192 kB
+-------------------------------------------------------------------
+=== Running /cygdrive/c/jenkins/workspace/CNTK-Test-Windows-W1/x64/release/cntk.exe configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/FeedForward.cntk currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu DeviceId=-1 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=2048]] speechTrain=[reader=[useMersenneTwisterRand=true]]
 -------------------------------------------------------------------
 Build info: 

-		Built time: May  3 2016 13:15:46
-		Last modified date: Tue Apr 26 23:35:31 2016
+		Built time: Aug 16 2016 03:09:16
+		Last modified date: Fri Aug 12 05:28:23 2016
 		Build type: Release
 		Build target: GPU
-		With 1bit-SGD: no
+		With 1bit-SGD: yes
+		Math lib: mkl
 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5
 		CUB_PATH: c:\src\cub-1.4.1
 		CUDNN_PATH: c:\NVIDIA\cudnn-4.0\cuda
 		Build Branch: HEAD
-		Build SHA1: af96f7cce6c3c78a4f1e9315e061291c79360e12
-		Built by svcphil on cntk-muc01
+		Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
+		Built by svcphil on Philly-Pool1
 		Build Path: c:\jenkins\workspace\CNTK-Build-Windows\Source\CNTK\
 -------------------------------------------------------------------
 Changed current directory to C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
@ -25,31 +31,39 @@ ping [requestnodes (after change)]: all 1 nodes responded
 mpihelper: only one MPI process: MPI operation will be boring
 ping [mpihelper]: 1 nodes pinging each other
 ping [mpihelper]: all 1 nodes responded
-05/03/2016 13:22:22: -------------------------------------------------------------------
-05/03/2016 13:22:22: Build info: 
+08/16/2016 03:20:10: -------------------------------------------------------------------
+08/16/2016 03:20:10: Build info: 

-05/03/2016 13:22:22: 		Built time: May  3 2016 13:15:46
-05/03/2016 13:22:22: 		Last modified date: Tue Apr 26 23:35:31 2016
-05/03/2016 13:22:22: 		Build type: Release
-05/03/2016 13:22:22: 		Build target: GPU
-05/03/2016 13:22:22: 		With 1bit-SGD: no
-05/03/2016 13:22:22: 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5
-05/03/2016 13:22:22: 		CUB_PATH: c:\src\cub-1.4.1
-05/03/2016 13:22:22: 		CUDNN_PATH: c:\NVIDIA\cudnn-4.0\cuda
-05/03/2016 13:22:22: 		Build Branch: HEAD
-05/03/2016 13:22:22: 		Build SHA1: af96f7cce6c3c78a4f1e9315e061291c79360e12
-05/03/2016 13:22:22: 		Built by svcphil on cntk-muc01
-05/03/2016 13:22:22: 		Build Path: c:\jenkins\workspace\CNTK-Build-Windows\Source\CNTK\
-05/03/2016 13:22:22: -------------------------------------------------------------------
+08/16/2016 03:20:10: 		Built time: Aug 16 2016 03:09:16
+08/16/2016 03:20:10: 		Last modified date: Fri Aug 12 05:28:23 2016
+08/16/2016 03:20:10: 		Build type: Release
+08/16/2016 03:20:10: 		Build target: GPU
+08/16/2016 03:20:10: 		With 1bit-SGD: yes
+08/16/2016 03:20:10: 		Math lib: mkl
+08/16/2016 03:20:10: 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5
+08/16/2016 03:20:10: 		CUB_PATH: c:\src\cub-1.4.1
+08/16/2016 03:20:10: 		CUDNN_PATH: c:\NVIDIA\cudnn-4.0\cuda
+08/16/2016 03:20:10: 		Build Branch: HEAD
+08/16/2016 03:20:10: 		Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
+08/16/2016 03:20:10: 		Built by svcphil on Philly-Pool1
+08/16/2016 03:20:10: 		Build Path: c:\jenkins\workspace\CNTK-Build-Windows\Source\CNTK\
+08/16/2016 03:20:10: -------------------------------------------------------------------
+08/16/2016 03:20:12: -------------------------------------------------------------------
+08/16/2016 03:20:12: GPU info:

-05/03/2016 13:22:22: Running on DPHAIM-22 at 2016/05/03 13:22:22
-05/03/2016 13:22:22: Command line: 
-C:\jenkins\workspace\CNTK-Test-Windows-W1\x64\release\cntk.exe  configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/FeedForward.cntk  currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data  RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu  DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data  ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config  OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu  DeviceId=-1  timestamping=true  speechTrain=[SGD=[maxEpochs=1]]  speechTrain=[SGD=[epochSize=2048]]
+08/16/2016 03:20:12: 		Device[0]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
+08/16/2016 03:20:12: 		Device[1]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
+08/16/2016 03:20:12: 		Device[2]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
+08/16/2016 03:20:12: -------------------------------------------------------------------
+
+08/16/2016 03:20:12: Running on DPHAIM-25 at 2016/08/16 03:20:12
+08/16/2016 03:20:12: Command line: 
+C:\jenkins\workspace\CNTK-Test-Windows-W1\x64\release\cntk.exe  configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/FeedForward.cntk  currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data  RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu  DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data  ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config  OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu  DeviceId=-1  timestamping=true  speechTrain=[SGD=[maxEpochs=1]]  speechTrain=[SGD=[epochSize=2048]]  speechTrain=[reader=[useMersenneTwisterRand=true]]



-05/03/2016 13:22:22: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
-05/03/2016 13:22:22: RootDir = ".."
+08/16/2016 03:20:12: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
+08/16/2016 03:20:12: RootDir = ".."
 ConfigDir = "$RootDir$/Config"
 DataDir = "$RootDir$/Data"
 OutputDir = "$RootDir$/Output"
@ -65,7 +79,7 @@ speechTrain = [
    SimpleNetworkBuilder = [
        layerSizes = 363:512:512:132
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        applyMeanVarNorm = true
        needPrior = true
@ -111,35 +125,36 @@ speechTrain = [
    ]
 ]
 currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
-RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu
+RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu
 DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
 ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
-OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu
+OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu
 DeviceId=-1
 timestamping=true
 speechTrain=[SGD=[maxEpochs=1]]
 speechTrain=[SGD=[epochSize=2048]]
+speechTrain=[reader=[useMersenneTwisterRand=true]]

-05/03/2016 13:22:22: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
+08/16/2016 03:20:12: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<

-05/03/2016 13:22:22: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
-05/03/2016 13:22:22: RootDir = ".."
+08/16/2016 03:20:12: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+08/16/2016 03:20:12: RootDir = ".."
 ConfigDir = "../Config"
 DataDir = "../Data"
 OutputDir = "../Output"
-ModelDir = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu/Models"
+ModelDir = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu/Models"
 deviceId = -1
 command = speechTrain
 precision = "float"
 traceLevel = "1"
-modelPath = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn"
+modelPath = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn"
 parallelTrain = true
 speechTrain = [
    action = "train"
    SimpleNetworkBuilder = [
        layerSizes = 363:512:512:132
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        applyMeanVarNorm = true
        needPrior = true
@ -185,36 +200,37 @@ speechTrain = [
    ]
 ]
 currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
-RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu
+RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu
 DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
 ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
-OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu
+OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu
 DeviceId=-1
 timestamping=true
 speechTrain=[SGD=[maxEpochs=1]]
 speechTrain=[SGD=[epochSize=2048]]
+speechTrain=[reader=[useMersenneTwisterRand=true]]

-05/03/2016 13:22:22: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+08/16/2016 03:20:12: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<

-05/03/2016 13:22:22: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+08/16/2016 03:20:12: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 configparameters: FeedForward.cntk:command=speechTrain
 configparameters: FeedForward.cntk:ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
 configparameters: FeedForward.cntk:currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
 configparameters: FeedForward.cntk:DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
 configparameters: FeedForward.cntk:deviceId=-1
-configparameters: FeedForward.cntk:ModelDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu/Models
-configparameters: FeedForward.cntk:modelPath=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn
-configparameters: FeedForward.cntk:OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu
+configparameters: FeedForward.cntk:ModelDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu/Models
+configparameters: FeedForward.cntk:modelPath=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn
+configparameters: FeedForward.cntk:OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu
 configparameters: FeedForward.cntk:parallelTrain=true
 configparameters: FeedForward.cntk:precision=float
 configparameters: FeedForward.cntk:RootDir=..
-configparameters: FeedForward.cntk:RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu
+configparameters: FeedForward.cntk:RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu
 configparameters: FeedForward.cntk:speechTrain=[
    action = "train"
    SimpleNetworkBuilder = [
        layerSizes = 363:512:512:132
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        applyMeanVarNorm = true
        needPrior = true
@ -258,24 +274,24 @@ configparameters: FeedForward.cntk:speechTrain=[
            labelType = "category"
        ]
    ]
-] [SGD=[maxEpochs=1]] [SGD=[epochSize=2048]]
+] [SGD=[maxEpochs=1]] [SGD=[epochSize=2048]] [reader=[useMersenneTwisterRand=true]]

 configparameters: FeedForward.cntk:timestamping=true
 configparameters: FeedForward.cntk:traceLevel=1
-05/03/2016 13:22:22: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
-05/03/2016 13:22:22: Commands: speechTrain
-05/03/2016 13:22:22: Precision = "float"
-05/03/2016 13:22:22: CNTKModelPath: C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn
-05/03/2016 13:22:22: CNTKCommandTrainInfo: speechTrain : 1
-05/03/2016 13:22:22: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 1
+08/16/2016 03:20:12: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+08/16/2016 03:20:12: Commands: speechTrain
+08/16/2016 03:20:12: Precision = "float"
+08/16/2016 03:20:12: CNTKModelPath: C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn
+08/16/2016 03:20:12: CNTKCommandTrainInfo: speechTrain : 1
+08/16/2016 03:20:12: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 1

-05/03/2016 13:22:22: ##############################################################################
-05/03/2016 13:22:22: #                                                                            #
-05/03/2016 13:22:22: # Action "train"                                                             #
-05/03/2016 13:22:22: #                                                                            #
-05/03/2016 13:22:22: ##############################################################################
+08/16/2016 03:20:12: ##############################################################################
+08/16/2016 03:20:12: #                                                                            #
+08/16/2016 03:20:12: # Action "train"                                                             #
+08/16/2016 03:20:12: #                                                                            #
+08/16/2016 03:20:12: ##############################################################################

-05/03/2016 13:22:22: CNTKCommandTrainBegin: speechTrain
+08/16/2016 03:20:12: CNTKCommandTrainBegin: speechTrain
 SimpleNetworkBuilder Using CPU
 reading script file C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.scp ... 948 entries
 total 132 state names in state list C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/state.list
@ -284,13 +300,25 @@ htkmlfreader: reading MLF file C:\jenkins\workspace\CNTK-Test-Windows-W1\Example
 label set 0: 129 classes
 minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames

-05/03/2016 13:22:23: Creating virgin network.
+08/16/2016 03:20:12: Creating virgin network.
+Node 'W0' (LearnableParameter operation): Initializing Parameter[512 x 363] <- 0.000000.
+Node 'W0' (LearnableParameter operation): Initializing Parameter[512 x 363] <- uniform(seed=1, range=0.050000*1.000000, onCPU=false).
+Node 'B0' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
+Node 'B0' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
+Node 'W1' (LearnableParameter operation): Initializing Parameter[512 x 512] <- 0.000000.
+Node 'W1' (LearnableParameter operation): Initializing Parameter[512 x 512] <- uniform(seed=2, range=0.050000*1.000000, onCPU=false).
+Node 'B1' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
+Node 'B1' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
+Node 'W2' (LearnableParameter operation): Initializing Parameter[132 x 512] <- 0.000000.
+Node 'W2' (LearnableParameter operation): Initializing Parameter[132 x 512] <- uniform(seed=3, range=0.050000*1.000000, onCPU=false).
+Node 'B2' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
+Node 'B2' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.

 Post-processing network...

 7 roots:
 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
 	InvStdOfFeatures = InvStdDev()
 	MeanOfFeatures = Mean()
 	PosteriorProb = Softmax()
@ -319,7 +347,7 @@ Validating --> W2*H1 = Times (W2, H2) : [132 x 512], [512 x 1 x *] -> [132 x 1 x
 Validating --> B2 = LearnableParameter() :  -> [132 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [132 x 1 x *], [132 x 1] -> [132 x 1 x *]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [132 x *], [132 x 1 x *] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [132 x *], [132 x 1 x *] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [132 x *], [132 x 1 x *] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [132 x 1 x *] -> [132 x 1 x *]
 Validating --> Prior = Mean (labels) : [132 x *] -> [132]
 Validating --> LogOfPrior = Log (Prior) : [132] -> [132]
@ -336,70 +364,70 @@ Validating network, final pass.

 Post-processing network complete.

-05/03/2016 13:22:23: Created model with 25 nodes on CPU.
+08/16/2016 03:20:12: Created model with 25 nodes on CPU.

-05/03/2016 13:22:23: Training criterion node(s):
-05/03/2016 13:22:23: 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax
+08/16/2016 03:20:12: Training criterion node(s):
+08/16/2016 03:20:12: 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax

-05/03/2016 13:22:23: Evaluation criterion node(s):
-
-05/03/2016 13:22:23: 	EvalClassificationError = ClassificationError
+08/16/2016 03:20:12: Evaluation criterion node(s):
+08/16/2016 03:20:12: 	EvalErrorPrediction = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.

-Memory Sharing Structure:
+Memory Sharing: Out of 40 matrices, 19 are shared as 8, and 21 are not shared.

-0000000000000000: {[EvalClassificationError Gradient[1]] [InvStdOfFeatures Gradient[363]] [LogOfPrior Gradient[132]] [MVNormalizedFeatures Gradient[363 x *]] [MeanOfFeatures Gradient[363]] [PosteriorProb Gradient[132 x 1 x *]] [PosteriorProb Value[132 x 1 x *]] [Prior Gradient[132]] [ScaledLogLikelihood Gradient[132 x 1 x *]] [features Gradient[363 x *]] [labels Gradient[132 x *]] }
-000000BDD334C430: {[features Value[363 x *]] }
-000000BDD334C4D0: {[W0 Value[512 x 363]] }
-000000BDD334C610: {[MeanOfFeatures Value[363]] }
-000000BDD334C890: {[B0 Value[512 x 1]] }
-000000BDD334CCF0: {[W1 Value[512 x 512]] }
-000000BDD334CE30: {[B1 Value[512 x 1]] }
-000000BDD334D1F0: {[InvStdOfFeatures Value[363]] }
-000000BDD5BCA080: {[Prior Value[132]] }
-000000BDD5BCA120: {[EvalClassificationError Value[1]] }
-000000BDD5BCA260: {[W2 Value[132 x 512]] }
-000000BDD5BCA440: {[labels Value[132 x *]] }
-000000BDD5BCA6C0: {[MVNormalizedFeatures Value[363 x *]] }
-000000BDD5BCAE40: {[B0 Gradient[512 x 1]] [H1 Gradient[512 x 1 x *]] [W1*H1+B1 Gradient[512 x 1 x *]] [W2*H1 Value[132 x 1 x *]] }
-000000BDD5BCAEE0: {[CrossEntropyWithSoftmax Gradient[1]] }
-000000BDD5BCAF80: {[B1 Gradient[512 x 1]] [H2 Gradient[512 x 1 x *]] [HLast Gradient[132 x 1 x *]] }
-000000BDD5BCB0C0: {[H1 Value[512 x 1 x *]] [W0*features Gradient[512 x *]] }
-000000BDD5BCB160: {[ScaledLogLikelihood Value[132 x 1 x *]] }
-000000BDD5BCB340: {[W0 Gradient[512 x 363]] [W0*features+B0 Value[512 x 1 x *]] }
-000000BDD5BCB520: {[W1 Gradient[512 x 512]] [W1*H1+B1 Value[512 x 1 x *]] }
-000000BDD5BCB5C0: {[B2 Gradient[132 x 1]] }
-000000BDD5BCB700: {[W0*features Value[512 x *]] }
-000000BDD5BCB7A0: {[HLast Value[132 x 1 x *]] [W2 Gradient[132 x 512]] }
-000000BDD5BCB8E0: {[LogOfPrior Value[132]] }
-000000BDD5BCB980: {[H2 Value[512 x 1 x *]] [W1*H1 Gradient[512 x 1 x *]] }
-000000BDD5BCBAC0: {[B2 Value[132 x 1]] }
-000000BDD5BCBB60: {[CrossEntropyWithSoftmax Value[1]] }
-000000BDD5BCBC00: {[W0*features+B0 Gradient[512 x 1 x *]] [W1*H1 Value[512 x 1 x *]] }
-000000BDD5BCBCA0: {[W2*H1 Gradient[132 x 1 x *]] }
+	{ W0*features+B0 : [512 x 1 x *] (gradient)
+	  W1*H1 : [512 x 1 x *] }
+	{ W0 : [512 x 363] (gradient)
+	  W0*features+B0 : [512 x 1 x *] }
+	{ H1 : [512 x 1 x *]
+	  W0*features : [512 x *] (gradient) }
+	{ W1 : [512 x 512] (gradient)
+	  W1*H1+B1 : [512 x 1 x *] }
+	{ H2 : [512 x 1 x *]
+	  W1*H1 : [512 x 1 x *] (gradient) }
+	{ HLast : [132 x 1 x *]
+	  W2 : [132 x 512] (gradient) }
+	{ B0 : [512 x 1] (gradient)
+	  H1 : [512 x 1 x *] (gradient)
+	  W1*H1+B1 : [512 x 1 x *] (gradient)
+	  W2*H1 : [132 x 1 x *] }
+	{ B1 : [512 x 1] (gradient)
+	  H2 : [512 x 1 x *] (gradient)
+	  HLast : [132 x 1 x *] (gradient) }


-05/03/2016 13:22:23: Precomputing --> 3 PreCompute nodes found.
+08/16/2016 03:20:12: Training 516740 parameters in 6 out of 6 parameter tensors and 15 nodes with gradient:

-05/03/2016 13:22:23: 	MeanOfFeatures = Mean()
-05/03/2016 13:22:23: 	InvStdOfFeatures = InvStdDev()
-05/03/2016 13:22:23: 	Prior = Mean()
+08/16/2016 03:20:12: 	Node 'B0' (LearnableParameter operation) : [512 x 1]
+08/16/2016 03:20:12: 	Node 'B1' (LearnableParameter operation) : [512 x 1]
+08/16/2016 03:20:12: 	Node 'B2' (LearnableParameter operation) : [132 x 1]
+08/16/2016 03:20:12: 	Node 'W0' (LearnableParameter operation) : [512 x 363]
+08/16/2016 03:20:12: 	Node 'W1' (LearnableParameter operation) : [512 x 512]
+08/16/2016 03:20:12: 	Node 'W2' (LearnableParameter operation) : [132 x 512]
+
+
+08/16/2016 03:20:12: Precomputing --> 3 PreCompute nodes found.
+
+08/16/2016 03:20:12: 	MeanOfFeatures = Mean()
+08/16/2016 03:20:12: 	InvStdOfFeatures = InvStdDev()
+08/16/2016 03:20:12: 	Prior = Mean()
 minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
 requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms

-05/03/2016 13:22:24: Precomputing --> Completed.
+08/16/2016 03:20:15: Precomputing --> Completed.


-05/03/2016 13:22:24: Starting Epoch 1: learning rate per sample = 0.003906  effective momentum = 0.900000  momentum as time constant = 2429.8 samples
+08/16/2016 03:20:15: Starting Epoch 1: learning rate per sample = 0.003906  effective momentum = 0.900000  momentum as time constant = 2429.8 samples
 minibatchiterator: epoch 0: frames [0..2048] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses

-05/03/2016 13:22:24: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1), distributed reading is ENABLED.
-05/03/2016 13:22:25: Finished Epoch[ 1 of 1]: [Training] CrossEntropyWithSoftmax = 4.48531419 * 2048; EvalClassificationError = 0.90722656 * 2048; totalSamplesSeen = 2048; learningRatePerSample = 0.00390625; epochTime=0.288909s
-05/03/2016 13:22:25: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn'
-05/03/2016 13:22:25: CNTKCommandTrainEnd: speechTrain
+08/16/2016 03:20:15: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1), distributed reading is ENABLED.
+08/16/2016 03:20:15: Finished Epoch[ 1 of 1]: [Training] CrossEntropyWithSoftmax = 4.46427900 * 2048; EvalErrorPrediction = 0.91259766 * 2048; totalSamplesSeen = 2048; learningRatePerSample = 0.00390625; epochTime=0.28059s
+08/16/2016 03:20:15: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn'
+08/16/2016 03:20:15: CNTKCommandTrainEnd: speechTrain

-05/03/2016 13:22:25: Action "train" complete.
+08/16/2016 03:20:15: Action "train" complete.

-05/03/2016 13:22:25: __COMPLETED__
+08/16/2016 03:20:15: __COMPLETED__
+~MPIWrapper
--- a/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/baseline.windows.debug.cpu.txt
+++ b/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/baseline.windows.debug.cpu.txt
@ -1 +0,0 @@
-__COMPLETED__
--- a/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/baseline.windows.debug.gpu.txt
+++ b/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/baseline.windows.debug.gpu.txt
@ -1 +0,0 @@
-__COMPLETED__
--- a/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/baseline.windows.release.gpu.txt
+++ b/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/baseline.windows.release.gpu.txt
@ -1,18 +1,24 @@
-=== Running /cygdrive/c/jenkins/workspace/CNTK-Test-Windows-W1/x64/release/cntk.exe configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/FeedForward.cntk currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu DeviceId=0 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=2048]]
+CPU info:
+    CPU Model Name: Intel(R) Xeon(R) CPU E5-2630 v2 @ 2.60GHz
+    Hardware threads: 24
+    Total Memory: 268381192 kB
+-------------------------------------------------------------------
+=== Running /cygdrive/c/jenkins/workspace/CNTK-Test-Windows-W1/x64/release/cntk.exe configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/FeedForward.cntk currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu DeviceId=0 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=2048]] speechTrain=[reader=[useMersenneTwisterRand=true]]
 -------------------------------------------------------------------
 Build info: 

-		Built time: May  3 2016 13:15:46
-		Last modified date: Tue Apr 26 23:35:31 2016
+		Built time: Aug 16 2016 03:09:16
+		Last modified date: Fri Aug 12 05:28:23 2016
 		Build type: Release
 		Build target: GPU
-		With 1bit-SGD: no
+		With 1bit-SGD: yes
+		Math lib: mkl
 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5
 		CUB_PATH: c:\src\cub-1.4.1
 		CUDNN_PATH: c:\NVIDIA\cudnn-4.0\cuda
 		Build Branch: HEAD
-		Build SHA1: af96f7cce6c3c78a4f1e9315e061291c79360e12
-		Built by svcphil on cntk-muc01
+		Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
+		Built by svcphil on Philly-Pool1
 		Build Path: c:\jenkins\workspace\CNTK-Build-Windows\Source\CNTK\
 -------------------------------------------------------------------
 Changed current directory to C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
@ -25,31 +31,39 @@ ping [requestnodes (after change)]: all 1 nodes responded
 mpihelper: only one MPI process: MPI operation will be boring
 ping [mpihelper]: 1 nodes pinging each other
 ping [mpihelper]: all 1 nodes responded
-05/03/2016 13:22:25: -------------------------------------------------------------------
-05/03/2016 13:22:25: Build info: 
+08/16/2016 03:20:17: -------------------------------------------------------------------
+08/16/2016 03:20:17: Build info: 

-05/03/2016 13:22:25: 		Built time: May  3 2016 13:15:46
-05/03/2016 13:22:25: 		Last modified date: Tue Apr 26 23:35:31 2016
-05/03/2016 13:22:25: 		Build type: Release
-05/03/2016 13:22:25: 		Build target: GPU
-05/03/2016 13:22:25: 		With 1bit-SGD: no
-05/03/2016 13:22:25: 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5
-05/03/2016 13:22:25: 		CUB_PATH: c:\src\cub-1.4.1
-05/03/2016 13:22:25: 		CUDNN_PATH: c:\NVIDIA\cudnn-4.0\cuda
-05/03/2016 13:22:25: 		Build Branch: HEAD
-05/03/2016 13:22:25: 		Build SHA1: af96f7cce6c3c78a4f1e9315e061291c79360e12
-05/03/2016 13:22:25: 		Built by svcphil on cntk-muc01
-05/03/2016 13:22:25: 		Build Path: c:\jenkins\workspace\CNTK-Build-Windows\Source\CNTK\
-05/03/2016 13:22:25: -------------------------------------------------------------------
+08/16/2016 03:20:17: 		Built time: Aug 16 2016 03:09:16
+08/16/2016 03:20:17: 		Last modified date: Fri Aug 12 05:28:23 2016
+08/16/2016 03:20:17: 		Build type: Release
+08/16/2016 03:20:17: 		Build target: GPU
+08/16/2016 03:20:17: 		With 1bit-SGD: yes
+08/16/2016 03:20:17: 		Math lib: mkl
+08/16/2016 03:20:17: 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5
+08/16/2016 03:20:17: 		CUB_PATH: c:\src\cub-1.4.1
+08/16/2016 03:20:17: 		CUDNN_PATH: c:\NVIDIA\cudnn-4.0\cuda
+08/16/2016 03:20:17: 		Build Branch: HEAD
+08/16/2016 03:20:17: 		Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
+08/16/2016 03:20:17: 		Built by svcphil on Philly-Pool1
+08/16/2016 03:20:17: 		Build Path: c:\jenkins\workspace\CNTK-Build-Windows\Source\CNTK\
+08/16/2016 03:20:17: -------------------------------------------------------------------
+08/16/2016 03:20:19: -------------------------------------------------------------------
+08/16/2016 03:20:19: GPU info:

-05/03/2016 13:22:25: Running on DPHAIM-22 at 2016/05/03 13:22:25
-05/03/2016 13:22:25: Command line: 
-C:\jenkins\workspace\CNTK-Test-Windows-W1\x64\release\cntk.exe  configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/FeedForward.cntk  currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data  RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu  DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data  ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config  OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu  DeviceId=0  timestamping=true  speechTrain=[SGD=[maxEpochs=1]]  speechTrain=[SGD=[epochSize=2048]]
+08/16/2016 03:20:19: 		Device[0]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
+08/16/2016 03:20:19: 		Device[1]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
+08/16/2016 03:20:19: 		Device[2]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
+08/16/2016 03:20:19: -------------------------------------------------------------------
+
+08/16/2016 03:20:19: Running on DPHAIM-25 at 2016/08/16 03:20:19
+08/16/2016 03:20:19: Command line: 
+C:\jenkins\workspace\CNTK-Test-Windows-W1\x64\release\cntk.exe  configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/FeedForward.cntk  currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data  RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu  DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data  ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config  OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu  DeviceId=0  timestamping=true  speechTrain=[SGD=[maxEpochs=1]]  speechTrain=[SGD=[epochSize=2048]]  speechTrain=[reader=[useMersenneTwisterRand=true]]



-05/03/2016 13:22:25: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
-05/03/2016 13:22:25: RootDir = ".."
+08/16/2016 03:20:19: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
+08/16/2016 03:20:19: RootDir = ".."
 ConfigDir = "$RootDir$/Config"
 DataDir = "$RootDir$/Data"
 OutputDir = "$RootDir$/Output"
@ -65,7 +79,7 @@ speechTrain = [
    SimpleNetworkBuilder = [
        layerSizes = 363:512:512:132
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        applyMeanVarNorm = true
        needPrior = true
@ -111,35 +125,36 @@ speechTrain = [
    ]
 ]
 currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
-RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu
+RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu
 DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
 ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
-OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu
+OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu
 DeviceId=0
 timestamping=true
 speechTrain=[SGD=[maxEpochs=1]]
 speechTrain=[SGD=[epochSize=2048]]
+speechTrain=[reader=[useMersenneTwisterRand=true]]

-05/03/2016 13:22:25: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
+08/16/2016 03:20:19: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<

-05/03/2016 13:22:25: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
-05/03/2016 13:22:25: RootDir = ".."
+08/16/2016 03:20:19: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+08/16/2016 03:20:19: RootDir = ".."
 ConfigDir = "../Config"
 DataDir = "../Data"
 OutputDir = "../Output"
-ModelDir = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu/Models"
+ModelDir = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu/Models"
 deviceId = -1
 command = speechTrain
 precision = "float"
 traceLevel = "1"
-modelPath = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn"
+modelPath = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn"
 parallelTrain = true
 speechTrain = [
    action = "train"
    SimpleNetworkBuilder = [
        layerSizes = 363:512:512:132
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        applyMeanVarNorm = true
        needPrior = true
@ -185,36 +200,37 @@ speechTrain = [
    ]
 ]
 currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
-RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu
+RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu
 DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
 ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
-OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu
+OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu
 DeviceId=0
 timestamping=true
 speechTrain=[SGD=[maxEpochs=1]]
 speechTrain=[SGD=[epochSize=2048]]
+speechTrain=[reader=[useMersenneTwisterRand=true]]

-05/03/2016 13:22:25: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+08/16/2016 03:20:19: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<

-05/03/2016 13:22:25: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+08/16/2016 03:20:19: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 configparameters: FeedForward.cntk:command=speechTrain
 configparameters: FeedForward.cntk:ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
 configparameters: FeedForward.cntk:currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
 configparameters: FeedForward.cntk:DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
 configparameters: FeedForward.cntk:deviceId=0
-configparameters: FeedForward.cntk:ModelDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu/Models
-configparameters: FeedForward.cntk:modelPath=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn
-configparameters: FeedForward.cntk:OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu
+configparameters: FeedForward.cntk:ModelDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu/Models
+configparameters: FeedForward.cntk:modelPath=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn
+configparameters: FeedForward.cntk:OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu
 configparameters: FeedForward.cntk:parallelTrain=true
 configparameters: FeedForward.cntk:precision=float
 configparameters: FeedForward.cntk:RootDir=..
-configparameters: FeedForward.cntk:RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu
+configparameters: FeedForward.cntk:RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu
 configparameters: FeedForward.cntk:speechTrain=[
    action = "train"
    SimpleNetworkBuilder = [
        layerSizes = 363:512:512:132
        trainingCriterion = "CrossEntropyWithSoftmax"
-        evalCriterion = "ClassificationError"
+        evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        applyMeanVarNorm = true
        needPrior = true
@ -258,24 +274,24 @@ configparameters: FeedForward.cntk:speechTrain=[
            labelType = "category"
        ]
    ]
-] [SGD=[maxEpochs=1]] [SGD=[epochSize=2048]]
+] [SGD=[maxEpochs=1]] [SGD=[epochSize=2048]] [reader=[useMersenneTwisterRand=true]]

 configparameters: FeedForward.cntk:timestamping=true
 configparameters: FeedForward.cntk:traceLevel=1
-05/03/2016 13:22:25: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
-05/03/2016 13:22:25: Commands: speechTrain
-05/03/2016 13:22:25: Precision = "float"
-05/03/2016 13:22:25: CNTKModelPath: C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn
-05/03/2016 13:22:25: CNTKCommandTrainInfo: speechTrain : 1
-05/03/2016 13:22:25: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 1
+08/16/2016 03:20:19: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+08/16/2016 03:20:19: Commands: speechTrain
+08/16/2016 03:20:19: Precision = "float"
+08/16/2016 03:20:19: CNTKModelPath: C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn
+08/16/2016 03:20:19: CNTKCommandTrainInfo: speechTrain : 1
+08/16/2016 03:20:19: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 1

-05/03/2016 13:22:25: ##############################################################################
-05/03/2016 13:22:25: #                                                                            #
-05/03/2016 13:22:25: # Action "train"                                                             #
-05/03/2016 13:22:25: #                                                                            #
-05/03/2016 13:22:25: ##############################################################################
+08/16/2016 03:20:19: ##############################################################################
+08/16/2016 03:20:19: #                                                                            #
+08/16/2016 03:20:19: # Action "train"                                                             #
+08/16/2016 03:20:19: #                                                                            #
+08/16/2016 03:20:19: ##############################################################################

-05/03/2016 13:22:25: CNTKCommandTrainBegin: speechTrain
+08/16/2016 03:20:19: CNTKCommandTrainBegin: speechTrain
 SimpleNetworkBuilder Using GPU 0
 reading script file C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.scp ... 948 entries
 total 132 state names in state list C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/state.list
@ -284,14 +300,26 @@ htkmlfreader: reading MLF file C:\jenkins\workspace\CNTK-Test-Windows-W1\Example
 label set 0: 129 classes
 minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames

-05/03/2016 13:22:25: Creating virgin network.
+08/16/2016 03:20:19: Creating virgin network.
+Node 'W0' (LearnableParameter operation): Initializing Parameter[512 x 363] <- 0.000000.
+Node 'W0' (LearnableParameter operation): Initializing Parameter[512 x 363] <- uniform(seed=1, range=0.050000*1.000000, onCPU=false).
 Microsoft::MSR::CNTK::GPUMatrix<ElemType>::SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4
+Node 'B0' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
+Node 'B0' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
+Node 'W1' (LearnableParameter operation): Initializing Parameter[512 x 512] <- 0.000000.
+Node 'W1' (LearnableParameter operation): Initializing Parameter[512 x 512] <- uniform(seed=2, range=0.050000*1.000000, onCPU=false).
+Node 'B1' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
+Node 'B1' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
+Node 'W2' (LearnableParameter operation): Initializing Parameter[132 x 512] <- 0.000000.
+Node 'W2' (LearnableParameter operation): Initializing Parameter[132 x 512] <- uniform(seed=3, range=0.050000*1.000000, onCPU=false).
+Node 'B2' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
+Node 'B2' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.

 Post-processing network...

 7 roots:
 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
 	InvStdOfFeatures = InvStdDev()
 	MeanOfFeatures = Mean()
 	PosteriorProb = Softmax()
@ -320,7 +348,7 @@ Validating --> W2*H1 = Times (W2, H2) : [132 x 512], [512 x 1 x *] -> [132 x 1 x
 Validating --> B2 = LearnableParameter() :  -> [132 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [132 x 1 x *], [132 x 1] -> [132 x 1 x *]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [132 x *], [132 x 1 x *] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [132 x *], [132 x 1 x *] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [132 x *], [132 x 1 x *] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [132 x 1 x *] -> [132 x 1 x *]
 Validating --> Prior = Mean (labels) : [132 x *] -> [132]
 Validating --> LogOfPrior = Log (Prior) : [132] -> [132]
@ -337,70 +365,70 @@ Validating network, final pass.

 Post-processing network complete.

-05/03/2016 13:22:26: Created model with 25 nodes on GPU 0.
+08/16/2016 03:20:20: Created model with 25 nodes on GPU 0.

-05/03/2016 13:22:26: Training criterion node(s):
-05/03/2016 13:22:26: 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax
+08/16/2016 03:20:20: Training criterion node(s):
+08/16/2016 03:20:20: 	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax

-05/03/2016 13:22:26: Evaluation criterion node(s):
-
-05/03/2016 13:22:26: 	EvalClassificationError = ClassificationError
+08/16/2016 03:20:20: Evaluation criterion node(s):
+08/16/2016 03:20:20: 	EvalErrorPrediction = ErrorPrediction


 Allocating matrices for forward and/or backward propagation.

-Memory Sharing Structure:
+Memory Sharing: Out of 40 matrices, 19 are shared as 8, and 21 are not shared.

-0000000000000000: {[EvalClassificationError Gradient[1]] [InvStdOfFeatures Gradient[363]] [LogOfPrior Gradient[132]] [MVNormalizedFeatures Gradient[363 x *]] [MeanOfFeatures Gradient[363]] [PosteriorProb Gradient[132 x 1 x *]] [PosteriorProb Value[132 x 1 x *]] [Prior Gradient[132]] [ScaledLogLikelihood Gradient[132 x 1 x *]] [features Gradient[363 x *]] [labels Gradient[132 x *]] }
-00000087D360C610: {[features Value[363 x *]] }
-00000087EB4FEEF0: {[W0 Value[512 x 363]] }
-00000087EB4FF530: {[B1 Value[512 x 1]] }
-00000087EB4FF850: {[W1 Value[512 x 512]] }
-00000087EB4FFC10: {[W2 Value[132 x 512]] }
-00000087EB500070: {[B2 Value[132 x 1]] }
-00000087EB5001B0: {[MeanOfFeatures Value[363]] }
-00000087EB500250: {[InvStdOfFeatures Value[363]] }
-00000087EB5004D0: {[B0 Value[512 x 1]] }
-00000087EDA2B150: {[labels Value[132 x *]] }
-00000087EDA2B330: {[B1 Gradient[512 x 1]] [H2 Gradient[512 x 1 x *]] [HLast Gradient[132 x 1 x *]] }
-00000087EDA2B3D0: {[Prior Value[132]] }
-00000087EDA2B6F0: {[HLast Value[132 x 1 x *]] [W2 Gradient[132 x 512]] }
-00000087EDA2B8D0: {[W0 Gradient[512 x 363]] [W0*features+B0 Value[512 x 1 x *]] }
-00000087EDA2BB50: {[CrossEntropyWithSoftmax Value[1]] }
-00000087EDA2BC90: {[W0*features+B0 Gradient[512 x 1 x *]] [W1*H1 Value[512 x 1 x *]] }
-00000087EDA2C0F0: {[EvalClassificationError Value[1]] }
-00000087EDA2C190: {[W0*features Value[512 x *]] }
-00000087EDA2C2D0: {[H1 Value[512 x 1 x *]] [W0*features Gradient[512 x *]] }
-00000087EDA2C370: {[W2*H1 Gradient[132 x 1 x *]] }
-00000087EDA2C410: {[B2 Gradient[132 x 1]] }
-00000087EDA2C730: {[ScaledLogLikelihood Value[132 x 1 x *]] }
-00000087EDA2C7D0: {[LogOfPrior Value[132]] }
-00000087EDA2CAF0: {[MVNormalizedFeatures Value[363 x *]] }
-00000087EDA2CB90: {[H2 Value[512 x 1 x *]] [W1*H1 Gradient[512 x 1 x *]] }
-00000087EDA2CCD0: {[B0 Gradient[512 x 1]] [H1 Gradient[512 x 1 x *]] [W1*H1+B1 Gradient[512 x 1 x *]] [W2*H1 Value[132 x 1 x *]] }
-00000087EDA2CEB0: {[CrossEntropyWithSoftmax Gradient[1]] }
-00000087EDA2CFF0: {[W1 Gradient[512 x 512]] [W1*H1+B1 Value[512 x 1 x *]] }
+	{ W0*features+B0 : [512 x 1 x *] (gradient)
+	  W1*H1 : [512 x 1 x *] }
+	{ H2 : [512 x 1 x *]
+	  W1*H1 : [512 x 1 x *] (gradient) }
+	{ HLast : [132 x 1 x *]
+	  W2 : [132 x 512] (gradient) }
+	{ W0 : [512 x 363] (gradient)
+	  W0*features+B0 : [512 x 1 x *] }
+	{ B0 : [512 x 1] (gradient)
+	  H1 : [512 x 1 x *] (gradient)
+	  W1*H1+B1 : [512 x 1 x *] (gradient)
+	  W2*H1 : [132 x 1 x *] }
+	{ H1 : [512 x 1 x *]
+	  W0*features : [512 x *] (gradient) }
+	{ W1 : [512 x 512] (gradient)
+	  W1*H1+B1 : [512 x 1 x *] }
+	{ B1 : [512 x 1] (gradient)
+	  H2 : [512 x 1 x *] (gradient)
+	  HLast : [132 x 1 x *] (gradient) }


-05/03/2016 13:22:26: Precomputing --> 3 PreCompute nodes found.
+08/16/2016 03:20:20: Training 516740 parameters in 6 out of 6 parameter tensors and 15 nodes with gradient:

-05/03/2016 13:22:26: 	MeanOfFeatures = Mean()
-05/03/2016 13:22:26: 	InvStdOfFeatures = InvStdDev()
-05/03/2016 13:22:26: 	Prior = Mean()
+08/16/2016 03:20:20: 	Node 'B0' (LearnableParameter operation) : [512 x 1]
+08/16/2016 03:20:20: 	Node 'B1' (LearnableParameter operation) : [512 x 1]
+08/16/2016 03:20:20: 	Node 'B2' (LearnableParameter operation) : [132 x 1]
+08/16/2016 03:20:20: 	Node 'W0' (LearnableParameter operation) : [512 x 363]
+08/16/2016 03:20:20: 	Node 'W1' (LearnableParameter operation) : [512 x 512]
+08/16/2016 03:20:20: 	Node 'W2' (LearnableParameter operation) : [132 x 512]
+
+
+08/16/2016 03:20:20: Precomputing --> 3 PreCompute nodes found.
+
+08/16/2016 03:20:20: 	MeanOfFeatures = Mean()
+08/16/2016 03:20:20: 	InvStdOfFeatures = InvStdDev()
+08/16/2016 03:20:20: 	Prior = Mean()
 minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
 requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms

-05/03/2016 13:22:27: Precomputing --> Completed.
+08/16/2016 03:20:21: Precomputing --> Completed.


-05/03/2016 13:22:27: Starting Epoch 1: learning rate per sample = 0.003906  effective momentum = 0.900000  momentum as time constant = 2429.8 samples
+08/16/2016 03:20:21: Starting Epoch 1: learning rate per sample = 0.003906  effective momentum = 0.900000  momentum as time constant = 2429.8 samples
 minibatchiterator: epoch 0: frames [0..2048] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses

-05/03/2016 13:22:27: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1), distributed reading is ENABLED.
-05/03/2016 13:22:27: Finished Epoch[ 1 of 1]: [Training] CrossEntropyWithSoftmax = 4.42832291 * 2048; EvalClassificationError = 0.91357422 * 2048; totalSamplesSeen = 2048; learningRatePerSample = 0.00390625; epochTime=0.052947s
-05/03/2016 13:22:27: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn'
-05/03/2016 13:22:27: CNTKCommandTrainEnd: speechTrain
+08/16/2016 03:20:21: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1), distributed reading is ENABLED.
+08/16/2016 03:20:21: Finished Epoch[ 1 of 1]: [Training] CrossEntropyWithSoftmax = 4.41144794 * 2048; EvalErrorPrediction = 0.92773438 * 2048; totalSamplesSeen = 2048; learningRatePerSample = 0.00390625; epochTime=0.05551s
+08/16/2016 03:20:21: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn'
+08/16/2016 03:20:21: CNTKCommandTrainEnd: speechTrain

-05/03/2016 13:22:27: Action "train" complete.
+08/16/2016 03:20:21: Action "train" complete.

-05/03/2016 13:22:27: __COMPLETED__
+08/16/2016 03:20:21: __COMPLETED__
+~MPIWrapper
--- a/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/run-test
+++ b/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/run-test
@ -5,5 +5,5 @@
 ConfigDir=$TEST_DIR/../../../../../../Examples/Speech/AN4/Config

 # cntkrun <CNTK config file name> <additional CNTK args>
-cntkrun FeedForward.cntk "speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=2048]]" || exit $?
+cntkrun FeedForward.cntk "speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=2048]] speechTrain=[reader=[useMersenneTwisterRand=true]]" || exit $?

--- a/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/baseline.linux.cpu.txt
+++ b/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/baseline.linux.cpu.txt
@ -0,0 +1,682 @@
+CPU info:
+    CPU Model Name: Intel(R) Xeon(R) CPU E5-2630 v2 @ 2.60GHz
+    Hardware threads: 24
+    Total Memory: 264172964 kB
+-------------------------------------------------------------------
+=== Running /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/1bitsgd/release/bin/cntk configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config/LSTM-NDL.cntk currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu DeviceId=-1 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=64]] speechTrain=[reader=[useMersenneTwisterRand=true]] parallelTrain=false
+-------------------------------------------------------------------
+Build info: 
+
+		Built time: Aug 16 2016 09:41:57
+		Last modified date: Mon Aug 15 23:39:17 2016
+		Build type: release
+		Build target: GPU
+		With 1bit-SGD: yes
+		Math lib: mkl
+		CUDA_PATH: /usr/local/cuda-7.5
+		CUB_PATH: /usr/local/cub-1.4.1
+		CUDNN_PATH: /usr/local/cudnn-4.0
+		Build Branch: HEAD
+		Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
+		Built by philly on 643085f7f8c2
+		Build Path: /home/philly/jenkins/workspace/CNTK-Build-Linux
+-------------------------------------------------------------------
+Changed current directory to /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+08/16/2016 10:01:47: -------------------------------------------------------------------
+08/16/2016 10:01:47: Build info: 
+
+08/16/2016 10:01:47: 		Built time: Aug 16 2016 09:41:57
+08/16/2016 10:01:47: 		Last modified date: Mon Aug 15 23:39:17 2016
+08/16/2016 10:01:47: 		Build type: release
+08/16/2016 10:01:47: 		Build target: GPU
+08/16/2016 10:01:47: 		With 1bit-SGD: yes
+08/16/2016 10:01:47: 		Math lib: mkl
+08/16/2016 10:01:47: 		CUDA_PATH: /usr/local/cuda-7.5
+08/16/2016 10:01:47: 		CUB_PATH: /usr/local/cub-1.4.1
+08/16/2016 10:01:47: 		CUDNN_PATH: /usr/local/cudnn-4.0
+08/16/2016 10:01:47: 		Build Branch: HEAD
+08/16/2016 10:01:47: 		Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
+08/16/2016 10:01:47: 		Built by philly on 643085f7f8c2
+08/16/2016 10:01:47: 		Build Path: /home/philly/jenkins/workspace/CNTK-Build-Linux
+08/16/2016 10:01:47: -------------------------------------------------------------------
+08/16/2016 10:01:47: -------------------------------------------------------------------
+08/16/2016 10:01:47: GPU info:
+
+08/16/2016 10:01:47: 		Device[0]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:01:47: 		Device[1]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:01:47: 		Device[2]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:01:47: 		Device[3]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:01:47: -------------------------------------------------------------------
+
+08/16/2016 10:01:47: Running on localhost at 2016/08/16 10:01:47
+08/16/2016 10:01:47: Command line: 
+/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/1bitsgd/release/bin/cntk  configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config/LSTM-NDL.cntk  currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data  RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu  DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data  ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config  OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu  DeviceId=-1  timestamping=true  speechTrain=[SGD=[maxEpochs=1]]  speechTrain=[SGD=[epochSize=64]]  speechTrain=[reader=[useMersenneTwisterRand=true]]  parallelTrain=false
+
+
+
+08/16/2016 10:01:47: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
+08/16/2016 10:01:47: RootDir = ".."
+ConfigDir = "$RootDir$/Config"
+DataDir = "$RootDir$/Data"
+OutputDir = "$RootDir$/Output"
+ModelDir = "$OutputDir$/Models"
+deviceId = -1
+command = speechTrain
+precision = "float"
+traceLevel = 1
+modelPath = "$ModelDir$/cntkSpeechLSTM.dnn"
+parallelTrain = true
+frameMode = false
+truncated = true
+speechTrain = [
+    action = "train"
+    nbrUttsIneachRecurrentIter = 16
+    NDLNetworkBuilder = [
+        networkDescription = "$ConfigDir$/lstmp-3layer-opt.ndl"
+    ]
+    SGD = [
+        epochSize = 0
+        minibatchSize = 16
+        learningRatesPerMB = 0.5
+        numMBsToShowResult = 10
+        momentumPerMB = 0:0.9
+        maxEpochs = 4
+        keepCheckPointFiles = true       
+    ]
+    reader = [
+        readerType = "HTKMLFReader"
+        readMethod = "blockRandomize"
+        miniBatchMode = "partial"
+        randomize = "auto"
+        verbosity = 0
+        features = [
+            dim = 363
+            type = "real"
+            scpFile = "$DataDir$/glob_0000.scp"
+        ]
+        labels = [
+            mlfFile = "$DataDir$/glob_0000.mlf"
+            labelMappingFile = "$DataDir$/state.list"
+            labelDim = 132
+            labelType = "category"
+        ]
+    ]
+]
+currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu
+DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config
+OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu
+DeviceId=-1
+timestamping=true
+speechTrain=[SGD=[maxEpochs=1]]
+speechTrain=[SGD=[epochSize=64]]
+speechTrain=[reader=[useMersenneTwisterRand=true]]
+parallelTrain=false
+
+08/16/2016 10:01:47: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
+
+08/16/2016 10:01:47: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+08/16/2016 10:01:47: RootDir = ".."
+ConfigDir = "../Config"
+DataDir = "../Data"
+OutputDir = "../Output"
+ModelDir = "/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu/Models"
+deviceId = -1
+command = speechTrain
+precision = "float"
+traceLevel = 1
+modelPath = "/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu/Models/cntkSpeechLSTM.dnn"
+parallelTrain = true
+frameMode = false
+truncated = true
+speechTrain = [
+    action = "train"
+    nbrUttsIneachRecurrentIter = 16
+    NDLNetworkBuilder = [
+        networkDescription = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config/lstmp-3layer-opt.ndl"
+    ]
+    SGD = [
+        epochSize = 0
+        minibatchSize = 16
+        learningRatesPerMB = 0.5
+        numMBsToShowResult = 10
+        momentumPerMB = 0:0.9
+        maxEpochs = 4
+        keepCheckPointFiles = true       
+    ]
+    reader = [
+        readerType = "HTKMLFReader"
+        readMethod = "blockRandomize"
+        miniBatchMode = "partial"
+        randomize = "auto"
+        verbosity = 0
+        features = [
+            dim = 363
+            type = "real"
+            scpFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp"
+        ]
+        labels = [
+            mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf"
+            labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list"
+            labelDim = 132
+            labelType = "category"
+        ]
+    ]
+]
+currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu
+DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config
+OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu
+DeviceId=-1
+timestamping=true
+speechTrain=[SGD=[maxEpochs=1]]
+speechTrain=[SGD=[epochSize=64]]
+speechTrain=[reader=[useMersenneTwisterRand=true]]
+parallelTrain=false
+
+08/16/2016 10:01:47: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+
+08/16/2016 10:01:47: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+configparameters: LSTM-NDL.cntk:command=speechTrain
+configparameters: LSTM-NDL.cntk:ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config
+configparameters: LSTM-NDL.cntk:currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+configparameters: LSTM-NDL.cntk:DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+configparameters: LSTM-NDL.cntk:deviceId=-1
+configparameters: LSTM-NDL.cntk:frameMode=false
+configparameters: LSTM-NDL.cntk:ModelDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu/Models
+configparameters: LSTM-NDL.cntk:modelPath=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu/Models/cntkSpeechLSTM.dnn
+configparameters: LSTM-NDL.cntk:OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu
+configparameters: LSTM-NDL.cntk:parallelTrain=false
+configparameters: LSTM-NDL.cntk:precision=float
+configparameters: LSTM-NDL.cntk:RootDir=..
+configparameters: LSTM-NDL.cntk:RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu
+configparameters: LSTM-NDL.cntk:speechTrain=[
+    action = "train"
+    nbrUttsIneachRecurrentIter = 16
+    NDLNetworkBuilder = [
+        networkDescription = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config/lstmp-3layer-opt.ndl"
+    ]
+    SGD = [
+        epochSize = 0
+        minibatchSize = 16
+        learningRatesPerMB = 0.5
+        numMBsToShowResult = 10
+        momentumPerMB = 0:0.9
+        maxEpochs = 4
+        keepCheckPointFiles = true       
+    ]
+    reader = [
+        readerType = "HTKMLFReader"
+        readMethod = "blockRandomize"
+        miniBatchMode = "partial"
+        randomize = "auto"
+        verbosity = 0
+        features = [
+            dim = 363
+            type = "real"
+            scpFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp"
+        ]
+        labels = [
+            mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf"
+            labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list"
+            labelDim = 132
+            labelType = "category"
+        ]
+    ]
+] [SGD=[maxEpochs=1]] [SGD=[epochSize=64]] [reader=[useMersenneTwisterRand=true]]
+
+configparameters: LSTM-NDL.cntk:timestamping=true
+configparameters: LSTM-NDL.cntk:traceLevel=1
+configparameters: LSTM-NDL.cntk:truncated=true
+08/16/2016 10:01:47: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+08/16/2016 10:01:47: Commands: speechTrain
+08/16/2016 10:01:47: Precision = "float"
+08/16/2016 10:01:47: CNTKModelPath: /tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu/Models/cntkSpeechLSTM.dnn
+08/16/2016 10:01:47: CNTKCommandTrainInfo: speechTrain : 1
+08/16/2016 10:01:47: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 1
+
+08/16/2016 10:01:47: ##############################################################################
+08/16/2016 10:01:47: #                                                                            #
+08/16/2016 10:01:47: # Action "train"                                                             #
+08/16/2016 10:01:47: #                                                                            #
+08/16/2016 10:01:47: ##############################################################################
+
+08/16/2016 10:01:47: CNTKCommandTrainBegin: speechTrain
+NDLBuilder Using CPU
+reading script file /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp ... 948 entries
+total 132 state names in state list /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list
+htkmlfreader: reading MLF file /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf ... total 948 entries
+...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
+label set 0: 129 classes
+minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
+useParallelTrain option is not enabled. ParallelTrain config will be ignored.
+08/16/2016 10:01:48: Creating virgin network.
+Node 'LSTMoutput1.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput1.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput1.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput1.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput1.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
+Node 'LSTMoutput2.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput2.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput2.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput2.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput2.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
+Node 'LSTMoutput3.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput3.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput3.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput3.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput3.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
+Node 'b' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
+Node 'LSTMoutput1.wx' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput1.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput1.Wh' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput1.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=3, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput1.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=4, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput1.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=5, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput1.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=6, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput2.wx' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput2.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput2.Wh' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput2.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=9, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput2.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=10, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput2.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=11, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput2.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=12, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput3.wx' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput3.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput3.Wh' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput3.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=15, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput3.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=16, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput3.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=17, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput3.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=18, range=0.050000*1.000000, onCPU=false).
+Node 'W' (LearnableParameter operation): Initializating Parameter[132 x 0] as uniform later when dimensions are fully known.
+Node 'b' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
+
+Post-processing network...
+
+6 roots:
+	ce = CrossEntropyWithSoftmax()
+	err = ErrorPrediction()
+	featNorm.xMean = Mean()
+	featNorm.xStdDev = InvStdDev()
+	logPrior.prior = Mean()
+	scaledLogLikelihood = Minus()
+
+Loop[0] --> Loop_LSTMoutput1.output -> 24 nodes
+
+	LSTMoutput1.dh	LSTMoutput1.whh	LSTMoutput1.wxxpbpwhh
+	LSTMoutput1.G4	LSTMoutput1.G3	LSTMoutput1.dc
+	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft
+	LSTMoutput1.bft	LSTMoutput1.G1	LSTMoutput1.Wcidc
+	LSTMoutput1.unnamed163	LSTMoutput1.it	LSTMoutput1.G2
+	LSTMoutput1.unnamed164	LSTMoutput1.bit	LSTMoutput1.ct
+	LSTMoutput1.Wcoct	LSTMoutput1.unnamed166	LSTMoutput1.ot
+	LSTMoutput1.unnamed167	LSTMoutput1.mt	LSTMoutput1.output
+
+Loop[1] --> Loop_LSTMoutput2.output -> 24 nodes
+
+	LSTMoutput2.dh	LSTMoutput2.whh	LSTMoutput2.wxxpbpwhh
+	LSTMoutput2.G4	LSTMoutput2.G3	LSTMoutput2.dc
+	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed175	LSTMoutput2.ft
+	LSTMoutput2.bft	LSTMoutput2.G1	LSTMoutput2.Wcidc
+	LSTMoutput2.unnamed173	LSTMoutput2.it	LSTMoutput2.G2
+	LSTMoutput2.unnamed174	LSTMoutput2.bit	LSTMoutput2.ct
+	LSTMoutput2.Wcoct	LSTMoutput2.unnamed176	LSTMoutput2.ot
+	LSTMoutput2.unnamed177	LSTMoutput2.mt	LSTMoutput2.output
+
+Loop[2] --> Loop_LSTMoutput3.output -> 24 nodes
+
+	LSTMoutput3.dh	LSTMoutput3.whh	LSTMoutput3.wxxpbpwhh
+	LSTMoutput3.G4	LSTMoutput3.G3	LSTMoutput3.dc
+	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed185	LSTMoutput3.ft
+	LSTMoutput3.bft	LSTMoutput3.G1	LSTMoutput3.Wcidc
+	LSTMoutput3.unnamed183	LSTMoutput3.it	LSTMoutput3.G2
+	LSTMoutput3.unnamed184	LSTMoutput3.bit	LSTMoutput3.ct
+	LSTMoutput3.Wcoct	LSTMoutput3.unnamed186	LSTMoutput3.ot
+	LSTMoutput3.unnamed187	LSTMoutput3.mt	LSTMoutput3.output
+
+Validating network. 113 nodes to process in pass 1.
+
+Validating --> labels = InputValue() :  -> [132 x *]
+Validating --> W = LearnableParameter() :  -> [132 x 0]
+Validating --> LSTMoutput3.Wmr = LearnableParameter() :  -> [512 x 1024]
+Validating --> LSTMoutput3.wx = LearnableParameter() :  -> [4096 x 0]
+Validating --> LSTMoutput2.Wmr = LearnableParameter() :  -> [512 x 1024]
+Validating --> LSTMoutput2.wx = LearnableParameter() :  -> [4096 x 0]
+Validating --> LSTMoutput1.Wmr = LearnableParameter() :  -> [512 x 1024]
+Validating --> LSTMoutput1.wx = LearnableParameter() :  -> [4096 x 0]
+Validating --> features = InputValue() :  -> [363 x *]
+Validating --> featNorm.xMean = Mean (features) : [363 x *] -> [363]
+Validating --> featNorm.xStdDev = InvStdDev (features) : [363 x *] -> [363]
+Validating --> featNorm.xNorm = PerDimMeanVarNormalization (features, featNorm.xMean, featNorm.xStdDev) : [363 x *], [363], [363] -> [363 x *]
+Node 'LSTMoutput1.wx' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 363].
+Node 'LSTMoutput1.wx' (LearnableParameter operation): Initializing Parameter[4096 x 363] <- uniform(seed=1, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput1.wxx = Times (LSTMoutput1.wx, featNorm.xNorm) : [4096 x 363], [363 x *] -> [4096 x *]
+Validating --> LSTMoutput1.b = LearnableParameter() :  -> [4096 x 1]
+Validating --> LSTMoutput1.wxxpb = Plus (LSTMoutput1.wxx, LSTMoutput1.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
+Validating --> LSTMoutput1.Wh = LearnableParameter() :  -> [4096 x 0]
+Validating --> LSTMoutput1.Wco = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput1.Wcf = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput1.Wci = LearnableParameter() :  -> [1024]
+Node 'LSTMoutput1.Wh' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512].
+Node 'LSTMoutput1.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=2, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput1.whh = Times (LSTMoutput1.Wh, LSTMoutput1.dh) : [4096 x 512], [512] -> [4096]
+Validating --> LSTMoutput1.wxxpbpwhh = Plus (LSTMoutput1.wxxpb, LSTMoutput1.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
+Validating --> LSTMoutput1.G4 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.G3 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.Wcfdc = DiagTimes (LSTMoutput1.Wcf, LSTMoutput1.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput1.unnamed165 = Plus (LSTMoutput1.G3, LSTMoutput1.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.ft = Sigmoid (LSTMoutput1.unnamed165) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.bft = ElementTimes (LSTMoutput1.ft, LSTMoutput1.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.G1 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.Wcidc = DiagTimes (LSTMoutput1.Wci, LSTMoutput1.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput1.unnamed163 = Plus (LSTMoutput1.G1, LSTMoutput1.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.it = Sigmoid (LSTMoutput1.unnamed163) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.G2 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.unnamed164 = Tanh (LSTMoutput1.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.bit = ElementTimes (LSTMoutput1.it, LSTMoutput1.unnamed164) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.ct = Plus (LSTMoutput1.bft, LSTMoutput1.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.Wcoct = DiagTimes (LSTMoutput1.Wco, LSTMoutput1.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.unnamed166 = Plus (LSTMoutput1.G4, LSTMoutput1.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.ot = Sigmoid (LSTMoutput1.unnamed166) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.unnamed167 = Tanh (LSTMoutput1.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.mt = ElementTimes (LSTMoutput1.ot, LSTMoutput1.unnamed167) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.output = Times (LSTMoutput1.Wmr, LSTMoutput1.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
+Node 'LSTMoutput2.wx' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512 x 1].
+Node 'LSTMoutput2.wx' (LearnableParameter operation): Initializing Parameter[4096 x 512 x 1] <- uniform(seed=7, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput2.wxx = Times (LSTMoutput2.wx, LSTMoutput1.output) : [4096 x 512 x 1], [512 x 1 x *] -> [4096 x *]
+Validating --> LSTMoutput2.b = LearnableParameter() :  -> [4096 x 1]
+Validating --> LSTMoutput2.wxxpb = Plus (LSTMoutput2.wxx, LSTMoutput2.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
+Validating --> LSTMoutput2.Wh = LearnableParameter() :  -> [4096 x 0]
+Validating --> LSTMoutput2.Wco = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput2.Wcf = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput2.Wci = LearnableParameter() :  -> [1024]
+Node 'LSTMoutput2.Wh' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512].
+Node 'LSTMoutput2.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=8, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput2.whh = Times (LSTMoutput2.Wh, LSTMoutput2.dh) : [4096 x 512], [512] -> [4096]
+Validating --> LSTMoutput2.wxxpbpwhh = Plus (LSTMoutput2.wxxpb, LSTMoutput2.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
+Validating --> LSTMoutput2.G4 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.G3 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.Wcfdc = DiagTimes (LSTMoutput2.Wcf, LSTMoutput2.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput2.unnamed175 = Plus (LSTMoutput2.G3, LSTMoutput2.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.ft = Sigmoid (LSTMoutput2.unnamed175) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.bft = ElementTimes (LSTMoutput2.ft, LSTMoutput2.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.G1 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.Wcidc = DiagTimes (LSTMoutput2.Wci, LSTMoutput2.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput2.unnamed173 = Plus (LSTMoutput2.G1, LSTMoutput2.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.it = Sigmoid (LSTMoutput2.unnamed173) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.G2 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.unnamed174 = Tanh (LSTMoutput2.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.bit = ElementTimes (LSTMoutput2.it, LSTMoutput2.unnamed174) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.ct = Plus (LSTMoutput2.bft, LSTMoutput2.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.Wcoct = DiagTimes (LSTMoutput2.Wco, LSTMoutput2.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.unnamed176 = Plus (LSTMoutput2.G4, LSTMoutput2.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.ot = Sigmoid (LSTMoutput2.unnamed176) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.unnamed177 = Tanh (LSTMoutput2.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.mt = ElementTimes (LSTMoutput2.ot, LSTMoutput2.unnamed177) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.output = Times (LSTMoutput2.Wmr, LSTMoutput2.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
+Node 'LSTMoutput3.wx' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512 x 1].
+Node 'LSTMoutput3.wx' (LearnableParameter operation): Initializing Parameter[4096 x 512 x 1] <- uniform(seed=13, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput3.wxx = Times (LSTMoutput3.wx, LSTMoutput2.output) : [4096 x 512 x 1], [512 x 1 x *] -> [4096 x *]
+Validating --> LSTMoutput3.b = LearnableParameter() :  -> [4096 x 1]
+Validating --> LSTMoutput3.wxxpb = Plus (LSTMoutput3.wxx, LSTMoutput3.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
+Validating --> LSTMoutput3.Wh = LearnableParameter() :  -> [4096 x 0]
+Validating --> LSTMoutput3.Wco = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput3.Wcf = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput3.Wci = LearnableParameter() :  -> [1024]
+Node 'LSTMoutput3.Wh' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512].
+Node 'LSTMoutput3.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=14, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput3.whh = Times (LSTMoutput3.Wh, LSTMoutput3.dh) : [4096 x 512], [512] -> [4096]
+Validating --> LSTMoutput3.wxxpbpwhh = Plus (LSTMoutput3.wxxpb, LSTMoutput3.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
+Validating --> LSTMoutput3.G4 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.G3 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.Wcfdc = DiagTimes (LSTMoutput3.Wcf, LSTMoutput3.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput3.unnamed185 = Plus (LSTMoutput3.G3, LSTMoutput3.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.ft = Sigmoid (LSTMoutput3.unnamed185) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.bft = ElementTimes (LSTMoutput3.ft, LSTMoutput3.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.G1 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.Wcidc = DiagTimes (LSTMoutput3.Wci, LSTMoutput3.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput3.unnamed183 = Plus (LSTMoutput3.G1, LSTMoutput3.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.it = Sigmoid (LSTMoutput3.unnamed183) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.G2 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.unnamed184 = Tanh (LSTMoutput3.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.bit = ElementTimes (LSTMoutput3.it, LSTMoutput3.unnamed184) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.ct = Plus (LSTMoutput3.bft, LSTMoutput3.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.Wcoct = DiagTimes (LSTMoutput3.Wco, LSTMoutput3.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.unnamed186 = Plus (LSTMoutput3.G4, LSTMoutput3.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.ot = Sigmoid (LSTMoutput3.unnamed186) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.unnamed187 = Tanh (LSTMoutput3.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.mt = ElementTimes (LSTMoutput3.ot, LSTMoutput3.unnamed187) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.output = Times (LSTMoutput3.Wmr, LSTMoutput3.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
+Node 'W' (LearnableParameter operation) operation: Tensor shape was inferred as [132 x 512 x 1].
+Node 'W' (LearnableParameter operation): Initializing Parameter[132 x 512 x 1] <- uniform(seed=19, range=0.050000*1.000000, onCPU=false).
+Validating --> unnamed193 = Times (W, LSTMoutput3.output) : [132 x 512 x 1], [512 x 1 x *] -> [132 x *]
+Validating --> b = LearnableParameter() :  -> [132 x 1]
+Validating --> LSTMoutputW = Plus (unnamed193, b) : [132 x *], [132 x 1] -> [132 x 1 x *]
+Validating --> ce = CrossEntropyWithSoftmax (labels, LSTMoutputW) : [132 x *], [132 x 1 x *] -> [1]
+Validating --> err = ErrorPrediction (labels, LSTMoutputW) : [132 x *], [132 x 1 x *] -> [1]
+Validating --> logPrior.prior = Mean (labels) : [132 x *] -> [132]
+Validating --> logPrior.logPrior = Log (logPrior.prior) : [132] -> [132]
+Validating --> scaledLogLikelihood = Minus (LSTMoutputW, logPrior.logPrior) : [132 x 1 x *], [132] -> [132 x 1 x *]
+
+Validating network. 88 nodes to process in pass 2.
+
+Validating --> LSTMoutput1.dh = PastValue (LSTMoutput1.output) : [512 x 1 x *] -> [512 x 1 x *]
+Validating --> LSTMoutput1.whh = Times (LSTMoutput1.Wh, LSTMoutput1.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
+Validating --> LSTMoutput1.dc = PastValue (LSTMoutput1.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.Wcfdc = DiagTimes (LSTMoutput1.Wcf, LSTMoutput1.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.Wcidc = DiagTimes (LSTMoutput1.Wci, LSTMoutput1.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.dh = PastValue (LSTMoutput2.output) : [512 x 1 x *] -> [512 x 1 x *]
+Validating --> LSTMoutput2.whh = Times (LSTMoutput2.Wh, LSTMoutput2.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
+Validating --> LSTMoutput2.dc = PastValue (LSTMoutput2.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.Wcfdc = DiagTimes (LSTMoutput2.Wcf, LSTMoutput2.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.Wcidc = DiagTimes (LSTMoutput2.Wci, LSTMoutput2.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.dh = PastValue (LSTMoutput3.output) : [512 x 1 x *] -> [512 x 1 x *]
+Validating --> LSTMoutput3.whh = Times (LSTMoutput3.Wh, LSTMoutput3.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
+Validating --> LSTMoutput3.dc = PastValue (LSTMoutput3.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.Wcfdc = DiagTimes (LSTMoutput3.Wcf, LSTMoutput3.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.Wcidc = DiagTimes (LSTMoutput3.Wci, LSTMoutput3.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+
+Validating network. 15 nodes to process in pass 3.
+
+
+Validating network, final pass.
+
+
+
+29 out of 113 nodes do not share the minibatch layout with the input data.
+
+Post-processing network complete.
+
+08/16/2016 10:01:48: Created model with 113 nodes on CPU.
+
+08/16/2016 10:01:48: Training criterion node(s):
+08/16/2016 10:01:48: 	ce = CrossEntropyWithSoftmax
+
+08/16/2016 10:01:48: Evaluation criterion node(s):
+08/16/2016 10:01:48: 	err = ErrorPrediction
+
+
+Allocating matrices for forward and/or backward propagation.
+
+Memory Sharing: Out of 217 matrices, 125 are shared as 56, and 92 are not shared.
+
+	{ LSTMoutput1.dh : [512 x 1 x *]
+	  LSTMoutput1.wxx : [4096 x *] (gradient) }
+	{ LSTMoutput2.Wco : [1024] (gradient)
+	  LSTMoutput3.dc : [1024 x 1 x *] }
+	{ LSTMoutput1.Wmr : [512 x 1024] (gradient)
+	  LSTMoutput2.wxx : [4096 x *] }
+	{ LSTMoutput2.wx : [4096 x 512 x 1] (gradient)
+	  LSTMoutput2.wxxpb : [4096 x 1 x *] }
+	{ LSTMoutput1.ot : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.whh : [4096 x 1 x *] }
+	{ LSTMoutput1.ct : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.wxxpbpwhh : [4096 x 1 x *] }
+	{ LSTMoutput1.G4 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.G4 : [1024 x 1 x *] }
+	{ LSTMoutput1.unnamed164 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.Wcfdc : [1024 x 1 x *] }
+	{ LSTMoutput1.wxxpbpwhh : [4096 x 1 x *] (gradient)
+	  LSTMoutput2.unnamed175 : [1024 x 1 x *] }
+	{ LSTMoutput1.G1 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.ft : [1024 x 1 x *] }
+	{ LSTMoutput1.Wci : [1024] (gradient)
+	  LSTMoutput2.G1 : [1024 x 1 x *] }
+	{ LSTMoutput1.G3 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.Wcidc : [1024 x 1 x *] }
+	{ LSTMoutput1.Wcf : [1024] (gradient)
+	  LSTMoutput2.it : [1024 x 1 x *] }
+	{ LSTMoutput1.whh : [4096 x 1 x *] (gradient)
+	  LSTMoutput2.G2 : [1024 x 1 x *] }
+	{ LSTMoutput1.b : [4096 x 1] (gradient)
+	  LSTMoutput1.dh : [512 x 1 x *] (gradient)
+	  LSTMoutput2.unnamed174 : [1024 x 1 x *] }
+	{ LSTMoutput2.Wmr : [512 x 1024] (gradient)
+	  LSTMoutput3.wxx : [4096 x *] }
+	{ LSTMoutput3.wx : [4096 x 512 x 1] (gradient)
+	  LSTMoutput3.wxxpb : [4096 x 1 x *] }
+	{ LSTMoutput2.ot : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.whh : [4096 x 1 x *] }
+	{ LSTMoutput2.ct : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.wxxpbpwhh : [4096 x 1 x *] }
+	{ LSTMoutput1.Wcoct : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.G4 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.G4 : [1024 x 1 x *] }
+	{ LSTMoutput2.unnamed174 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.Wcfdc : [1024 x 1 x *] }
+	{ LSTMoutput1.unnamed166 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.wxxpbpwhh : [4096 x 1 x *] (gradient)
+	  LSTMoutput3.unnamed185 : [1024 x 1 x *] }
+	{ LSTMoutput1.dc : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.G1 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.ft : [1024 x 1 x *] }
+	{ LSTMoutput1.unnamed165 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.bft : [1024 x 1 x *] }
+	{ LSTMoutput2.Wci : [1024] (gradient)
+	  LSTMoutput3.G1 : [1024 x 1 x *] }
+	{ LSTMoutput2.G3 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.Wcidc : [1024 x 1 x *] }
+	{ LSTMoutput1.it : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.unnamed183 : [1024 x 1 x *] }
+	{ LSTMoutput2.Wcf : [1024] (gradient)
+	  LSTMoutput3.it : [1024 x 1 x *] }
+	{ LSTMoutput1.unnamed167 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.whh : [4096 x 1 x *] (gradient)
+	  LSTMoutput3.G2 : [1024 x 1 x *] }
+	{ LSTMoutput2.b : [4096 x 1] (gradient)
+	  LSTMoutput2.dh : [512 x 1 x *] (gradient)
+	  LSTMoutput3.unnamed184 : [1024 x 1 x *] }
+	{ LSTMoutput3.Wmr : [512 x 1024] (gradient)
+	  unnamed193 : [132 x *] }
+	{ LSTMoutputW : [132 x 1 x *]
+	  W : [132 x 512 x 1] (gradient) }
+	{ LSTMoutput1.mt : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.dh : [512 x 1 x *]
+	  LSTMoutput2.wxx : [4096 x *] (gradient) }
+	{ LSTMoutput1.wx : [4096 x 363] (gradient)
+	  LSTMoutput1.wxxpb : [4096 x 1 x *] }
+	{ LSTMoutput2.mt : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.dh : [512 x 1 x *]
+	  LSTMoutput3.wxx : [4096 x *] (gradient) }
+	{ LSTMoutput3.output : [512 x 1 x *] (gradient)
+	  LSTMoutputW : [132 x 1 x *] (gradient) }
+	{ LSTMoutput3.mt : [1024 x 1 x *] (gradient)
+	  unnamed193 : [132 x *] (gradient) }
+	{ LSTMoutput2.Wcoct : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.G4 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.ft : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.bft : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.output : [512 x 1 x *] (gradient)
+	  LSTMoutput2.wxxpb : [4096 x 1 x *] (gradient)
+	  LSTMoutput3.it : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput2.Wh : [4096 x 512] (gradient)
+	  LSTMoutput3.G2 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput2.unnamed176 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.wxxpbpwhh : [4096 x 1 x *] (gradient) }
+	{ LSTMoutput1.bit : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.unnamed183 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.bft : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.dc : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.G1 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.G2 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.Wcfdc : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.Wcidc : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.unnamed163 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.unnamed175 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.Wcidc : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.ft : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput2.bft : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.dc : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.Wcfdc : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.Wcidc : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.ft : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput2.unnamed173 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.unnamed185 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.Wh : [4096 x 512] (gradient)
+	  LSTMoutput2.G2 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.Wcfdc : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.wxxpb : [4096 x 1 x *] (gradient)
+	  LSTMoutput2.it : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput2.output : [512 x 1 x *] (gradient)
+	  LSTMoutput3.wxxpb : [4096 x 1 x *] (gradient) }
+	{ LSTMoutput2.unnamed177 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.whh : [4096 x 1 x *] (gradient) }
+	{ LSTMoutput3.b : [4096 x 1] (gradient)
+	  LSTMoutput3.dh : [512 x 1 x *] (gradient) }
+	{ LSTMoutput1.Wco : [1024] (gradient)
+	  LSTMoutput2.dc : [1024 x 1 x *] }
+
+
+08/16/2016 10:01:48: Training 13634692 parameters in 23 out of 23 parameter tensors and 104 nodes with gradient:
+
+08/16/2016 10:01:48: 	Node 'LSTMoutput1.Wcf' (LearnableParameter operation) : [1024]
+08/16/2016 10:01:48: 	Node 'LSTMoutput1.Wci' (LearnableParameter operation) : [1024]
+08/16/2016 10:01:48: 	Node 'LSTMoutput1.Wco' (LearnableParameter operation) : [1024]
+08/16/2016 10:01:48: 	Node 'LSTMoutput1.Wh' (LearnableParameter operation) : [4096 x 512]
+08/16/2016 10:01:48: 	Node 'LSTMoutput1.Wmr' (LearnableParameter operation) : [512 x 1024]
+08/16/2016 10:01:48: 	Node 'LSTMoutput1.b' (LearnableParameter operation) : [4096 x 1]
+08/16/2016 10:01:48: 	Node 'LSTMoutput1.wx' (LearnableParameter operation) : [4096 x 363]
+08/16/2016 10:01:48: 	Node 'LSTMoutput2.Wcf' (LearnableParameter operation) : [1024]
+08/16/2016 10:01:48: 	Node 'LSTMoutput2.Wci' (LearnableParameter operation) : [1024]
+08/16/2016 10:01:48: 	Node 'LSTMoutput2.Wco' (LearnableParameter operation) : [1024]
+08/16/2016 10:01:48: 	Node 'LSTMoutput2.Wh' (LearnableParameter operation) : [4096 x 512]
+08/16/2016 10:01:48: 	Node 'LSTMoutput2.Wmr' (LearnableParameter operation) : [512 x 1024]
+08/16/2016 10:01:48: 	Node 'LSTMoutput2.b' (LearnableParameter operation) : [4096 x 1]
+08/16/2016 10:01:48: 	Node 'LSTMoutput2.wx' (LearnableParameter operation) : [4096 x 512 x 1]
+08/16/2016 10:01:48: 	Node 'LSTMoutput3.Wcf' (LearnableParameter operation) : [1024]
+08/16/2016 10:01:48: 	Node 'LSTMoutput3.Wci' (LearnableParameter operation) : [1024]
+08/16/2016 10:01:48: 	Node 'LSTMoutput3.Wco' (LearnableParameter operation) : [1024]
+08/16/2016 10:01:48: 	Node 'LSTMoutput3.Wh' (LearnableParameter operation) : [4096 x 512]
+08/16/2016 10:01:48: 	Node 'LSTMoutput3.Wmr' (LearnableParameter operation) : [512 x 1024]
+08/16/2016 10:01:48: 	Node 'LSTMoutput3.b' (LearnableParameter operation) : [4096 x 1]
+08/16/2016 10:01:48: 	Node 'LSTMoutput3.wx' (LearnableParameter operation) : [4096 x 512 x 1]
+08/16/2016 10:01:48: 	Node 'W' (LearnableParameter operation) : [132 x 512 x 1]
+08/16/2016 10:01:48: 	Node 'b' (LearnableParameter operation) : [132 x 1]
+
+
+08/16/2016 10:01:48: Precomputing --> 3 PreCompute nodes found.
+
+08/16/2016 10:01:48: 	featNorm.xMean = Mean()
+08/16/2016 10:01:48: 	featNorm.xStdDev = InvStdDev()
+08/16/2016 10:01:48: 	logPrior.prior = Mean()
+minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
+requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
+
+08/16/2016 10:01:49: Precomputing --> Completed.
+
+
+08/16/2016 10:01:50: Starting Epoch 1: learning rate per sample = 0.001953  effective momentum = 0.000000  momentum as time constant = 0.0 samples
+minibatchiterator: epoch 0: frames [0..64] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
+
+08/16/2016 10:01:50: Starting minibatch loop.
+08/16/2016 10:01:53:  Epoch[ 1 of 1]-Minibatch[   1-  10, 250.00%]: ce = 4.87313957 * 160; err = 0.90625000 * 160; time = 3.3910s; samplesPerSecond = 47.2
+08/16/2016 10:01:56:  Epoch[ 1 of 1]-Minibatch[  11-  20, 500.00%]: ce = 4.84521751 * 160; err = 0.69375000 * 160; time = 2.9626s; samplesPerSecond = 54.0
+08/16/2016 10:01:58: Finished Epoch[ 1 of 1]: [Training] ce = 4.85644356 * 418; err = 0.80382775 * 418; totalSamplesSeen = 418; learningRatePerSample = 0.001953125; epochTime=8.39953s
+08/16/2016 10:01:59: SGD: Saving checkpoint model '/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu/Models/cntkSpeechLSTM.dnn'
+08/16/2016 10:01:59: CNTKCommandTrainEnd: speechTrain
+
+08/16/2016 10:01:59: Action "train" complete.
+
+08/16/2016 10:01:59: __COMPLETED__
--- a/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/baseline.linux.debug.cpu.txt
+++ b/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/baseline.linux.debug.cpu.txt
@ -1 +0,0 @@
-__COMPLETED__
--- a/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/baseline.linux.debug.gpu.txt
+++ b/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/baseline.linux.debug.gpu.txt
@ -1 +0,0 @@
-__COMPLETED__
--- a/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/baseline.linux.gpu.txt
+++ b/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/baseline.linux.gpu.txt
@ -0,0 +1,683 @@
+CPU info:
+    CPU Model Name: Intel(R) Xeon(R) CPU E5-2630 v2 @ 2.60GHz
+    Hardware threads: 24
+    Total Memory: 264172964 kB
+-------------------------------------------------------------------
+=== Running /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/1bitsgd/release/bin/cntk configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config/LSTM-NDL.cntk currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu DeviceId=0 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=64]] speechTrain=[reader=[useMersenneTwisterRand=true]] parallelTrain=false
+-------------------------------------------------------------------
+Build info: 
+
+		Built time: Aug 16 2016 09:41:57
+		Last modified date: Mon Aug 15 23:39:17 2016
+		Build type: release
+		Build target: GPU
+		With 1bit-SGD: yes
+		Math lib: mkl
+		CUDA_PATH: /usr/local/cuda-7.5
+		CUB_PATH: /usr/local/cub-1.4.1
+		CUDNN_PATH: /usr/local/cudnn-4.0
+		Build Branch: HEAD
+		Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
+		Built by philly on 643085f7f8c2
+		Build Path: /home/philly/jenkins/workspace/CNTK-Build-Linux
+-------------------------------------------------------------------
+Changed current directory to /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+08/16/2016 10:02:00: -------------------------------------------------------------------
+08/16/2016 10:02:00: Build info: 
+
+08/16/2016 10:02:00: 		Built time: Aug 16 2016 09:41:57
+08/16/2016 10:02:00: 		Last modified date: Mon Aug 15 23:39:17 2016
+08/16/2016 10:02:00: 		Build type: release
+08/16/2016 10:02:00: 		Build target: GPU
+08/16/2016 10:02:00: 		With 1bit-SGD: yes
+08/16/2016 10:02:00: 		Math lib: mkl
+08/16/2016 10:02:00: 		CUDA_PATH: /usr/local/cuda-7.5
+08/16/2016 10:02:00: 		CUB_PATH: /usr/local/cub-1.4.1
+08/16/2016 10:02:00: 		CUDNN_PATH: /usr/local/cudnn-4.0
+08/16/2016 10:02:00: 		Build Branch: HEAD
+08/16/2016 10:02:00: 		Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
+08/16/2016 10:02:00: 		Built by philly on 643085f7f8c2
+08/16/2016 10:02:00: 		Build Path: /home/philly/jenkins/workspace/CNTK-Build-Linux
+08/16/2016 10:02:00: -------------------------------------------------------------------
+08/16/2016 10:02:01: -------------------------------------------------------------------
+08/16/2016 10:02:01: GPU info:
+
+08/16/2016 10:02:01: 		Device[0]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:02:01: 		Device[1]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:02:01: 		Device[2]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:02:01: 		Device[3]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:02:01: -------------------------------------------------------------------
+
+08/16/2016 10:02:01: Running on localhost at 2016/08/16 10:02:01
+08/16/2016 10:02:01: Command line: 
+/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/1bitsgd/release/bin/cntk  configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config/LSTM-NDL.cntk  currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data  RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu  DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data  ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config  OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu  DeviceId=0  timestamping=true  speechTrain=[SGD=[maxEpochs=1]]  speechTrain=[SGD=[epochSize=64]]  speechTrain=[reader=[useMersenneTwisterRand=true]]  parallelTrain=false
+
+
+
+08/16/2016 10:02:01: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
+08/16/2016 10:02:01: RootDir = ".."
+ConfigDir = "$RootDir$/Config"
+DataDir = "$RootDir$/Data"
+OutputDir = "$RootDir$/Output"
+ModelDir = "$OutputDir$/Models"
+deviceId = -1
+command = speechTrain
+precision = "float"
+traceLevel = 1
+modelPath = "$ModelDir$/cntkSpeechLSTM.dnn"
+parallelTrain = true
+frameMode = false
+truncated = true
+speechTrain = [
+    action = "train"
+    nbrUttsIneachRecurrentIter = 16
+    NDLNetworkBuilder = [
+        networkDescription = "$ConfigDir$/lstmp-3layer-opt.ndl"
+    ]
+    SGD = [
+        epochSize = 0
+        minibatchSize = 16
+        learningRatesPerMB = 0.5
+        numMBsToShowResult = 10
+        momentumPerMB = 0:0.9
+        maxEpochs = 4
+        keepCheckPointFiles = true       
+    ]
+    reader = [
+        readerType = "HTKMLFReader"
+        readMethod = "blockRandomize"
+        miniBatchMode = "partial"
+        randomize = "auto"
+        verbosity = 0
+        features = [
+            dim = 363
+            type = "real"
+            scpFile = "$DataDir$/glob_0000.scp"
+        ]
+        labels = [
+            mlfFile = "$DataDir$/glob_0000.mlf"
+            labelMappingFile = "$DataDir$/state.list"
+            labelDim = 132
+            labelType = "category"
+        ]
+    ]
+]
+currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu
+DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config
+OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu
+DeviceId=0
+timestamping=true
+speechTrain=[SGD=[maxEpochs=1]]
+speechTrain=[SGD=[epochSize=64]]
+speechTrain=[reader=[useMersenneTwisterRand=true]]
+parallelTrain=false
+
+08/16/2016 10:02:01: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
+
+08/16/2016 10:02:01: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+08/16/2016 10:02:01: RootDir = ".."
+ConfigDir = "../Config"
+DataDir = "../Data"
+OutputDir = "../Output"
+ModelDir = "/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu/Models"
+deviceId = -1
+command = speechTrain
+precision = "float"
+traceLevel = 1
+modelPath = "/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu/Models/cntkSpeechLSTM.dnn"
+parallelTrain = true
+frameMode = false
+truncated = true
+speechTrain = [
+    action = "train"
+    nbrUttsIneachRecurrentIter = 16
+    NDLNetworkBuilder = [
+        networkDescription = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config/lstmp-3layer-opt.ndl"
+    ]
+    SGD = [
+        epochSize = 0
+        minibatchSize = 16
+        learningRatesPerMB = 0.5
+        numMBsToShowResult = 10
+        momentumPerMB = 0:0.9
+        maxEpochs = 4
+        keepCheckPointFiles = true       
+    ]
+    reader = [
+        readerType = "HTKMLFReader"
+        readMethod = "blockRandomize"
+        miniBatchMode = "partial"
+        randomize = "auto"
+        verbosity = 0
+        features = [
+            dim = 363
+            type = "real"
+            scpFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp"
+        ]
+        labels = [
+            mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf"
+            labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list"
+            labelDim = 132
+            labelType = "category"
+        ]
+    ]
+]
+currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu
+DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config
+OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu
+DeviceId=0
+timestamping=true
+speechTrain=[SGD=[maxEpochs=1]]
+speechTrain=[SGD=[epochSize=64]]
+speechTrain=[reader=[useMersenneTwisterRand=true]]
+parallelTrain=false
+
+08/16/2016 10:02:01: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+
+08/16/2016 10:02:01: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+configparameters: LSTM-NDL.cntk:command=speechTrain
+configparameters: LSTM-NDL.cntk:ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config
+configparameters: LSTM-NDL.cntk:currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+configparameters: LSTM-NDL.cntk:DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
+configparameters: LSTM-NDL.cntk:deviceId=0
+configparameters: LSTM-NDL.cntk:frameMode=false
+configparameters: LSTM-NDL.cntk:ModelDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu/Models
+configparameters: LSTM-NDL.cntk:modelPath=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu/Models/cntkSpeechLSTM.dnn
+configparameters: LSTM-NDL.cntk:OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu
+configparameters: LSTM-NDL.cntk:parallelTrain=false
+configparameters: LSTM-NDL.cntk:precision=float
+configparameters: LSTM-NDL.cntk:RootDir=..
+configparameters: LSTM-NDL.cntk:RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu
+configparameters: LSTM-NDL.cntk:speechTrain=[
+    action = "train"
+    nbrUttsIneachRecurrentIter = 16
+    NDLNetworkBuilder = [
+        networkDescription = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config/lstmp-3layer-opt.ndl"
+    ]
+    SGD = [
+        epochSize = 0
+        minibatchSize = 16
+        learningRatesPerMB = 0.5
+        numMBsToShowResult = 10
+        momentumPerMB = 0:0.9
+        maxEpochs = 4
+        keepCheckPointFiles = true       
+    ]
+    reader = [
+        readerType = "HTKMLFReader"
+        readMethod = "blockRandomize"
+        miniBatchMode = "partial"
+        randomize = "auto"
+        verbosity = 0
+        features = [
+            dim = 363
+            type = "real"
+            scpFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp"
+        ]
+        labels = [
+            mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf"
+            labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list"
+            labelDim = 132
+            labelType = "category"
+        ]
+    ]
+] [SGD=[maxEpochs=1]] [SGD=[epochSize=64]] [reader=[useMersenneTwisterRand=true]]
+
+configparameters: LSTM-NDL.cntk:timestamping=true
+configparameters: LSTM-NDL.cntk:traceLevel=1
+configparameters: LSTM-NDL.cntk:truncated=true
+08/16/2016 10:02:01: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+08/16/2016 10:02:01: Commands: speechTrain
+08/16/2016 10:02:01: Precision = "float"
+08/16/2016 10:02:01: CNTKModelPath: /tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu/Models/cntkSpeechLSTM.dnn
+08/16/2016 10:02:01: CNTKCommandTrainInfo: speechTrain : 1
+08/16/2016 10:02:01: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 1
+
+08/16/2016 10:02:01: ##############################################################################
+08/16/2016 10:02:01: #                                                                            #
+08/16/2016 10:02:01: # Action "train"                                                             #
+08/16/2016 10:02:01: #                                                                            #
+08/16/2016 10:02:01: ##############################################################################
+
+08/16/2016 10:02:01: CNTKCommandTrainBegin: speechTrain
+NDLBuilder Using GPU 0
+reading script file /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp ... 948 entries
+total 132 state names in state list /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list
+htkmlfreader: reading MLF file /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf ... total 948 entries
+...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
+label set 0: 129 classes
+minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
+useParallelTrain option is not enabled. ParallelTrain config will be ignored.
+08/16/2016 10:02:01: Creating virgin network.
+Node 'LSTMoutput1.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput1.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput1.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput1.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput1.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
+Node 'LSTMoutput2.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput2.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput2.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput2.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput2.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
+Node 'LSTMoutput3.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput3.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput3.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput3.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput3.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
+Node 'b' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
+Node 'LSTMoutput1.wx' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput1.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput1.Wh' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput1.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=3, range=0.050000*1.000000, onCPU=false).
+SetUniformRandomValue (GPU): creating curand object with seed 3, sizeof(ElemType)==4
+Node 'LSTMoutput1.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=4, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput1.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=5, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput1.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=6, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput2.wx' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput2.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput2.Wh' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput2.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=9, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput2.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=10, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput2.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=11, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput2.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=12, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput3.wx' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput3.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput3.Wh' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput3.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=15, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput3.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=16, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput3.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=17, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput3.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=18, range=0.050000*1.000000, onCPU=false).
+Node 'W' (LearnableParameter operation): Initializating Parameter[132 x 0] as uniform later when dimensions are fully known.
+Node 'b' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
+
+Post-processing network...
+
+6 roots:
+	ce = CrossEntropyWithSoftmax()
+	err = ErrorPrediction()
+	featNorm.xMean = Mean()
+	featNorm.xStdDev = InvStdDev()
+	logPrior.prior = Mean()
+	scaledLogLikelihood = Minus()
+
+Loop[0] --> Loop_LSTMoutput1.output -> 24 nodes
+
+	LSTMoutput1.dh	LSTMoutput1.whh	LSTMoutput1.wxxpbpwhh
+	LSTMoutput1.G4	LSTMoutput1.G3	LSTMoutput1.dc
+	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft
+	LSTMoutput1.bft	LSTMoutput1.G1	LSTMoutput1.Wcidc
+	LSTMoutput1.unnamed163	LSTMoutput1.it	LSTMoutput1.G2
+	LSTMoutput1.unnamed164	LSTMoutput1.bit	LSTMoutput1.ct
+	LSTMoutput1.Wcoct	LSTMoutput1.unnamed166	LSTMoutput1.ot
+	LSTMoutput1.unnamed167	LSTMoutput1.mt	LSTMoutput1.output
+
+Loop[1] --> Loop_LSTMoutput2.output -> 24 nodes
+
+	LSTMoutput2.dh	LSTMoutput2.whh	LSTMoutput2.wxxpbpwhh
+	LSTMoutput2.G4	LSTMoutput2.G3	LSTMoutput2.dc
+	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed175	LSTMoutput2.ft
+	LSTMoutput2.bft	LSTMoutput2.G1	LSTMoutput2.Wcidc
+	LSTMoutput2.unnamed173	LSTMoutput2.it	LSTMoutput2.G2
+	LSTMoutput2.unnamed174	LSTMoutput2.bit	LSTMoutput2.ct
+	LSTMoutput2.Wcoct	LSTMoutput2.unnamed176	LSTMoutput2.ot
+	LSTMoutput2.unnamed177	LSTMoutput2.mt	LSTMoutput2.output
+
+Loop[2] --> Loop_LSTMoutput3.output -> 24 nodes
+
+	LSTMoutput3.dh	LSTMoutput3.whh	LSTMoutput3.wxxpbpwhh
+	LSTMoutput3.G4	LSTMoutput3.G3	LSTMoutput3.dc
+	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed185	LSTMoutput3.ft
+	LSTMoutput3.bft	LSTMoutput3.G1	LSTMoutput3.Wcidc
+	LSTMoutput3.unnamed183	LSTMoutput3.it	LSTMoutput3.G2
+	LSTMoutput3.unnamed184	LSTMoutput3.bit	LSTMoutput3.ct
+	LSTMoutput3.Wcoct	LSTMoutput3.unnamed186	LSTMoutput3.ot
+	LSTMoutput3.unnamed187	LSTMoutput3.mt	LSTMoutput3.output
+
+Validating network. 113 nodes to process in pass 1.
+
+Validating --> labels = InputValue() :  -> [132 x *]
+Validating --> W = LearnableParameter() :  -> [132 x 0]
+Validating --> LSTMoutput3.Wmr = LearnableParameter() :  -> [512 x 1024]
+Validating --> LSTMoutput3.wx = LearnableParameter() :  -> [4096 x 0]
+Validating --> LSTMoutput2.Wmr = LearnableParameter() :  -> [512 x 1024]
+Validating --> LSTMoutput2.wx = LearnableParameter() :  -> [4096 x 0]
+Validating --> LSTMoutput1.Wmr = LearnableParameter() :  -> [512 x 1024]
+Validating --> LSTMoutput1.wx = LearnableParameter() :  -> [4096 x 0]
+Validating --> features = InputValue() :  -> [363 x *]
+Validating --> featNorm.xMean = Mean (features) : [363 x *] -> [363]
+Validating --> featNorm.xStdDev = InvStdDev (features) : [363 x *] -> [363]
+Validating --> featNorm.xNorm = PerDimMeanVarNormalization (features, featNorm.xMean, featNorm.xStdDev) : [363 x *], [363], [363] -> [363 x *]
+Node 'LSTMoutput1.wx' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 363].
+Node 'LSTMoutput1.wx' (LearnableParameter operation): Initializing Parameter[4096 x 363] <- uniform(seed=1, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput1.wxx = Times (LSTMoutput1.wx, featNorm.xNorm) : [4096 x 363], [363 x *] -> [4096 x *]
+Validating --> LSTMoutput1.b = LearnableParameter() :  -> [4096 x 1]
+Validating --> LSTMoutput1.wxxpb = Plus (LSTMoutput1.wxx, LSTMoutput1.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
+Validating --> LSTMoutput1.Wh = LearnableParameter() :  -> [4096 x 0]
+Validating --> LSTMoutput1.Wco = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput1.Wcf = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput1.Wci = LearnableParameter() :  -> [1024]
+Node 'LSTMoutput1.Wh' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512].
+Node 'LSTMoutput1.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=2, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput1.whh = Times (LSTMoutput1.Wh, LSTMoutput1.dh) : [4096 x 512], [512] -> [4096]
+Validating --> LSTMoutput1.wxxpbpwhh = Plus (LSTMoutput1.wxxpb, LSTMoutput1.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
+Validating --> LSTMoutput1.G4 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.G3 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.Wcfdc = DiagTimes (LSTMoutput1.Wcf, LSTMoutput1.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput1.unnamed165 = Plus (LSTMoutput1.G3, LSTMoutput1.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.ft = Sigmoid (LSTMoutput1.unnamed165) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.bft = ElementTimes (LSTMoutput1.ft, LSTMoutput1.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.G1 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.Wcidc = DiagTimes (LSTMoutput1.Wci, LSTMoutput1.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput1.unnamed163 = Plus (LSTMoutput1.G1, LSTMoutput1.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.it = Sigmoid (LSTMoutput1.unnamed163) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.G2 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.unnamed164 = Tanh (LSTMoutput1.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.bit = ElementTimes (LSTMoutput1.it, LSTMoutput1.unnamed164) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.ct = Plus (LSTMoutput1.bft, LSTMoutput1.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.Wcoct = DiagTimes (LSTMoutput1.Wco, LSTMoutput1.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.unnamed166 = Plus (LSTMoutput1.G4, LSTMoutput1.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.ot = Sigmoid (LSTMoutput1.unnamed166) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.unnamed167 = Tanh (LSTMoutput1.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.mt = ElementTimes (LSTMoutput1.ot, LSTMoutput1.unnamed167) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.output = Times (LSTMoutput1.Wmr, LSTMoutput1.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
+Node 'LSTMoutput2.wx' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512 x 1].
+Node 'LSTMoutput2.wx' (LearnableParameter operation): Initializing Parameter[4096 x 512 x 1] <- uniform(seed=7, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput2.wxx = Times (LSTMoutput2.wx, LSTMoutput1.output) : [4096 x 512 x 1], [512 x 1 x *] -> [4096 x *]
+Validating --> LSTMoutput2.b = LearnableParameter() :  -> [4096 x 1]
+Validating --> LSTMoutput2.wxxpb = Plus (LSTMoutput2.wxx, LSTMoutput2.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
+Validating --> LSTMoutput2.Wh = LearnableParameter() :  -> [4096 x 0]
+Validating --> LSTMoutput2.Wco = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput2.Wcf = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput2.Wci = LearnableParameter() :  -> [1024]
+Node 'LSTMoutput2.Wh' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512].
+Node 'LSTMoutput2.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=8, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput2.whh = Times (LSTMoutput2.Wh, LSTMoutput2.dh) : [4096 x 512], [512] -> [4096]
+Validating --> LSTMoutput2.wxxpbpwhh = Plus (LSTMoutput2.wxxpb, LSTMoutput2.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
+Validating --> LSTMoutput2.G4 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.G3 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.Wcfdc = DiagTimes (LSTMoutput2.Wcf, LSTMoutput2.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput2.unnamed175 = Plus (LSTMoutput2.G3, LSTMoutput2.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.ft = Sigmoid (LSTMoutput2.unnamed175) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.bft = ElementTimes (LSTMoutput2.ft, LSTMoutput2.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.G1 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.Wcidc = DiagTimes (LSTMoutput2.Wci, LSTMoutput2.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput2.unnamed173 = Plus (LSTMoutput2.G1, LSTMoutput2.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.it = Sigmoid (LSTMoutput2.unnamed173) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.G2 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.unnamed174 = Tanh (LSTMoutput2.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.bit = ElementTimes (LSTMoutput2.it, LSTMoutput2.unnamed174) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.ct = Plus (LSTMoutput2.bft, LSTMoutput2.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.Wcoct = DiagTimes (LSTMoutput2.Wco, LSTMoutput2.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.unnamed176 = Plus (LSTMoutput2.G4, LSTMoutput2.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.ot = Sigmoid (LSTMoutput2.unnamed176) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.unnamed177 = Tanh (LSTMoutput2.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.mt = ElementTimes (LSTMoutput2.ot, LSTMoutput2.unnamed177) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.output = Times (LSTMoutput2.Wmr, LSTMoutput2.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
+Node 'LSTMoutput3.wx' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512 x 1].
+Node 'LSTMoutput3.wx' (LearnableParameter operation): Initializing Parameter[4096 x 512 x 1] <- uniform(seed=13, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput3.wxx = Times (LSTMoutput3.wx, LSTMoutput2.output) : [4096 x 512 x 1], [512 x 1 x *] -> [4096 x *]
+Validating --> LSTMoutput3.b = LearnableParameter() :  -> [4096 x 1]
+Validating --> LSTMoutput3.wxxpb = Plus (LSTMoutput3.wxx, LSTMoutput3.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
+Validating --> LSTMoutput3.Wh = LearnableParameter() :  -> [4096 x 0]
+Validating --> LSTMoutput3.Wco = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput3.Wcf = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput3.Wci = LearnableParameter() :  -> [1024]
+Node 'LSTMoutput3.Wh' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512].
+Node 'LSTMoutput3.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=14, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput3.whh = Times (LSTMoutput3.Wh, LSTMoutput3.dh) : [4096 x 512], [512] -> [4096]
+Validating --> LSTMoutput3.wxxpbpwhh = Plus (LSTMoutput3.wxxpb, LSTMoutput3.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
+Validating --> LSTMoutput3.G4 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.G3 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.Wcfdc = DiagTimes (LSTMoutput3.Wcf, LSTMoutput3.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput3.unnamed185 = Plus (LSTMoutput3.G3, LSTMoutput3.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.ft = Sigmoid (LSTMoutput3.unnamed185) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.bft = ElementTimes (LSTMoutput3.ft, LSTMoutput3.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.G1 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.Wcidc = DiagTimes (LSTMoutput3.Wci, LSTMoutput3.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput3.unnamed183 = Plus (LSTMoutput3.G1, LSTMoutput3.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.it = Sigmoid (LSTMoutput3.unnamed183) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.G2 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.unnamed184 = Tanh (LSTMoutput3.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.bit = ElementTimes (LSTMoutput3.it, LSTMoutput3.unnamed184) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.ct = Plus (LSTMoutput3.bft, LSTMoutput3.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.Wcoct = DiagTimes (LSTMoutput3.Wco, LSTMoutput3.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.unnamed186 = Plus (LSTMoutput3.G4, LSTMoutput3.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.ot = Sigmoid (LSTMoutput3.unnamed186) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.unnamed187 = Tanh (LSTMoutput3.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.mt = ElementTimes (LSTMoutput3.ot, LSTMoutput3.unnamed187) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.output = Times (LSTMoutput3.Wmr, LSTMoutput3.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
+Node 'W' (LearnableParameter operation) operation: Tensor shape was inferred as [132 x 512 x 1].
+Node 'W' (LearnableParameter operation): Initializing Parameter[132 x 512 x 1] <- uniform(seed=19, range=0.050000*1.000000, onCPU=false).
+Validating --> unnamed193 = Times (W, LSTMoutput3.output) : [132 x 512 x 1], [512 x 1 x *] -> [132 x *]
+Validating --> b = LearnableParameter() :  -> [132 x 1]
+Validating --> LSTMoutputW = Plus (unnamed193, b) : [132 x *], [132 x 1] -> [132 x 1 x *]
+Validating --> ce = CrossEntropyWithSoftmax (labels, LSTMoutputW) : [132 x *], [132 x 1 x *] -> [1]
+Validating --> err = ErrorPrediction (labels, LSTMoutputW) : [132 x *], [132 x 1 x *] -> [1]
+Validating --> logPrior.prior = Mean (labels) : [132 x *] -> [132]
+Validating --> logPrior.logPrior = Log (logPrior.prior) : [132] -> [132]
+Validating --> scaledLogLikelihood = Minus (LSTMoutputW, logPrior.logPrior) : [132 x 1 x *], [132] -> [132 x 1 x *]
+
+Validating network. 88 nodes to process in pass 2.
+
+Validating --> LSTMoutput1.dh = PastValue (LSTMoutput1.output) : [512 x 1 x *] -> [512 x 1 x *]
+Validating --> LSTMoutput1.whh = Times (LSTMoutput1.Wh, LSTMoutput1.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
+Validating --> LSTMoutput1.dc = PastValue (LSTMoutput1.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.Wcfdc = DiagTimes (LSTMoutput1.Wcf, LSTMoutput1.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.Wcidc = DiagTimes (LSTMoutput1.Wci, LSTMoutput1.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.dh = PastValue (LSTMoutput2.output) : [512 x 1 x *] -> [512 x 1 x *]
+Validating --> LSTMoutput2.whh = Times (LSTMoutput2.Wh, LSTMoutput2.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
+Validating --> LSTMoutput2.dc = PastValue (LSTMoutput2.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.Wcfdc = DiagTimes (LSTMoutput2.Wcf, LSTMoutput2.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.Wcidc = DiagTimes (LSTMoutput2.Wci, LSTMoutput2.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.dh = PastValue (LSTMoutput3.output) : [512 x 1 x *] -> [512 x 1 x *]
+Validating --> LSTMoutput3.whh = Times (LSTMoutput3.Wh, LSTMoutput3.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
+Validating --> LSTMoutput3.dc = PastValue (LSTMoutput3.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.Wcfdc = DiagTimes (LSTMoutput3.Wcf, LSTMoutput3.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.Wcidc = DiagTimes (LSTMoutput3.Wci, LSTMoutput3.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+
+Validating network. 15 nodes to process in pass 3.
+
+
+Validating network, final pass.
+
+
+
+29 out of 113 nodes do not share the minibatch layout with the input data.
+
+Post-processing network complete.
+
+08/16/2016 10:02:01: Created model with 113 nodes on GPU 0.
+
+08/16/2016 10:02:01: Training criterion node(s):
+08/16/2016 10:02:01: 	ce = CrossEntropyWithSoftmax
+
+08/16/2016 10:02:01: Evaluation criterion node(s):
+08/16/2016 10:02:01: 	err = ErrorPrediction
+
+
+Allocating matrices for forward and/or backward propagation.
+
+Memory Sharing: Out of 217 matrices, 125 are shared as 56, and 92 are not shared.
+
+	{ LSTMoutput2.mt : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.dh : [512 x 1 x *]
+	  LSTMoutput3.wxx : [4096 x *] (gradient) }
+	{ LSTMoutput2.Wco : [1024] (gradient)
+	  LSTMoutput3.dc : [1024 x 1 x *] }
+	{ LSTMoutput1.wx : [4096 x 363] (gradient)
+	  LSTMoutput1.wxxpb : [4096 x 1 x *] }
+	{ LSTMoutput1.Wmr : [512 x 1024] (gradient)
+	  LSTMoutput2.wxx : [4096 x *] }
+	{ LSTMoutput2.wx : [4096 x 512 x 1] (gradient)
+	  LSTMoutput2.wxxpb : [4096 x 1 x *] }
+	{ LSTMoutput1.ot : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.whh : [4096 x 1 x *] }
+	{ LSTMoutput1.ct : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.wxxpbpwhh : [4096 x 1 x *] }
+	{ LSTMoutput1.G4 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.G4 : [1024 x 1 x *] }
+	{ LSTMoutput1.unnamed164 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.Wcfdc : [1024 x 1 x *] }
+	{ LSTMoutput1.wxxpbpwhh : [4096 x 1 x *] (gradient)
+	  LSTMoutput2.unnamed175 : [1024 x 1 x *] }
+	{ LSTMoutput1.G1 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.ft : [1024 x 1 x *] }
+	{ LSTMoutput1.Wci : [1024] (gradient)
+	  LSTMoutput2.G1 : [1024 x 1 x *] }
+	{ LSTMoutput1.G3 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.Wcidc : [1024 x 1 x *] }
+	{ LSTMoutput1.Wcf : [1024] (gradient)
+	  LSTMoutput2.it : [1024 x 1 x *] }
+	{ LSTMoutput1.whh : [4096 x 1 x *] (gradient)
+	  LSTMoutput2.G2 : [1024 x 1 x *] }
+	{ LSTMoutput1.b : [4096 x 1] (gradient)
+	  LSTMoutput1.dh : [512 x 1 x *] (gradient)
+	  LSTMoutput2.unnamed174 : [1024 x 1 x *] }
+	{ LSTMoutput2.Wmr : [512 x 1024] (gradient)
+	  LSTMoutput3.wxx : [4096 x *] }
+	{ LSTMoutput3.wx : [4096 x 512 x 1] (gradient)
+	  LSTMoutput3.wxxpb : [4096 x 1 x *] }
+	{ LSTMoutput2.ot : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.whh : [4096 x 1 x *] }
+	{ LSTMoutput2.ct : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.wxxpbpwhh : [4096 x 1 x *] }
+	{ LSTMoutput1.Wcoct : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.G4 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.G4 : [1024 x 1 x *] }
+	{ LSTMoutput2.unnamed174 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.Wcfdc : [1024 x 1 x *] }
+	{ LSTMoutput1.unnamed166 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.wxxpbpwhh : [4096 x 1 x *] (gradient)
+	  LSTMoutput3.unnamed185 : [1024 x 1 x *] }
+	{ LSTMoutput1.dc : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.G1 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.ft : [1024 x 1 x *] }
+	{ LSTMoutput1.unnamed165 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.bft : [1024 x 1 x *] }
+	{ LSTMoutput2.Wci : [1024] (gradient)
+	  LSTMoutput3.G1 : [1024 x 1 x *] }
+	{ LSTMoutput2.G3 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.Wcidc : [1024 x 1 x *] }
+	{ LSTMoutput1.it : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.unnamed183 : [1024 x 1 x *] }
+	{ LSTMoutput2.Wcf : [1024] (gradient)
+	  LSTMoutput3.it : [1024 x 1 x *] }
+	{ LSTMoutput1.unnamed167 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.whh : [4096 x 1 x *] (gradient)
+	  LSTMoutput3.G2 : [1024 x 1 x *] }
+	{ LSTMoutput2.b : [4096 x 1] (gradient)
+	  LSTMoutput2.dh : [512 x 1 x *] (gradient)
+	  LSTMoutput3.unnamed184 : [1024 x 1 x *] }
+	{ LSTMoutput3.Wmr : [512 x 1024] (gradient)
+	  unnamed193 : [132 x *] }
+	{ LSTMoutputW : [132 x 1 x *]
+	  W : [132 x 512 x 1] (gradient) }
+	{ LSTMoutput3.output : [512 x 1 x *] (gradient)
+	  LSTMoutputW : [132 x 1 x *] (gradient) }
+	{ LSTMoutput3.mt : [1024 x 1 x *] (gradient)
+	  unnamed193 : [132 x *] (gradient) }
+	{ LSTMoutput2.Wcoct : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.G4 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.ft : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.bft : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.output : [512 x 1 x *] (gradient)
+	  LSTMoutput2.wxxpb : [4096 x 1 x *] (gradient)
+	  LSTMoutput3.it : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput2.Wh : [4096 x 512] (gradient)
+	  LSTMoutput3.G2 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput2.unnamed176 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.wxxpbpwhh : [4096 x 1 x *] (gradient) }
+	{ LSTMoutput1.bit : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.unnamed183 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.bft : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.dc : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.G1 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.G2 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.Wcfdc : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.Wcidc : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.unnamed163 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.unnamed175 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.Wcidc : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.ft : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput2.bft : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.dc : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.Wcfdc : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.Wcidc : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.ft : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput2.unnamed173 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.unnamed185 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.Wh : [4096 x 512] (gradient)
+	  LSTMoutput2.G2 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.Wcfdc : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.wxxpb : [4096 x 1 x *] (gradient)
+	  LSTMoutput2.it : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput2.output : [512 x 1 x *] (gradient)
+	  LSTMoutput3.wxxpb : [4096 x 1 x *] (gradient) }
+	{ LSTMoutput2.unnamed177 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.whh : [4096 x 1 x *] (gradient) }
+	{ LSTMoutput3.b : [4096 x 1] (gradient)
+	  LSTMoutput3.dh : [512 x 1 x *] (gradient) }
+	{ LSTMoutput1.dh : [512 x 1 x *]
+	  LSTMoutput1.wxx : [4096 x *] (gradient) }
+	{ LSTMoutput1.mt : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.dh : [512 x 1 x *]
+	  LSTMoutput2.wxx : [4096 x *] (gradient) }
+	{ LSTMoutput1.Wco : [1024] (gradient)
+	  LSTMoutput2.dc : [1024 x 1 x *] }
+
+
+08/16/2016 10:02:01: Training 13634692 parameters in 23 out of 23 parameter tensors and 104 nodes with gradient:
+
+08/16/2016 10:02:01: 	Node 'LSTMoutput1.Wcf' (LearnableParameter operation) : [1024]
+08/16/2016 10:02:01: 	Node 'LSTMoutput1.Wci' (LearnableParameter operation) : [1024]
+08/16/2016 10:02:01: 	Node 'LSTMoutput1.Wco' (LearnableParameter operation) : [1024]
+08/16/2016 10:02:01: 	Node 'LSTMoutput1.Wh' (LearnableParameter operation) : [4096 x 512]
+08/16/2016 10:02:01: 	Node 'LSTMoutput1.Wmr' (LearnableParameter operation) : [512 x 1024]
+08/16/2016 10:02:01: 	Node 'LSTMoutput1.b' (LearnableParameter operation) : [4096 x 1]
+08/16/2016 10:02:01: 	Node 'LSTMoutput1.wx' (LearnableParameter operation) : [4096 x 363]
+08/16/2016 10:02:01: 	Node 'LSTMoutput2.Wcf' (LearnableParameter operation) : [1024]
+08/16/2016 10:02:01: 	Node 'LSTMoutput2.Wci' (LearnableParameter operation) : [1024]
+08/16/2016 10:02:01: 	Node 'LSTMoutput2.Wco' (LearnableParameter operation) : [1024]
+08/16/2016 10:02:01: 	Node 'LSTMoutput2.Wh' (LearnableParameter operation) : [4096 x 512]
+08/16/2016 10:02:01: 	Node 'LSTMoutput2.Wmr' (LearnableParameter operation) : [512 x 1024]
+08/16/2016 10:02:01: 	Node 'LSTMoutput2.b' (LearnableParameter operation) : [4096 x 1]
+08/16/2016 10:02:01: 	Node 'LSTMoutput2.wx' (LearnableParameter operation) : [4096 x 512 x 1]
+08/16/2016 10:02:01: 	Node 'LSTMoutput3.Wcf' (LearnableParameter operation) : [1024]
+08/16/2016 10:02:01: 	Node 'LSTMoutput3.Wci' (LearnableParameter operation) : [1024]
+08/16/2016 10:02:01: 	Node 'LSTMoutput3.Wco' (LearnableParameter operation) : [1024]
+08/16/2016 10:02:01: 	Node 'LSTMoutput3.Wh' (LearnableParameter operation) : [4096 x 512]
+08/16/2016 10:02:01: 	Node 'LSTMoutput3.Wmr' (LearnableParameter operation) : [512 x 1024]
+08/16/2016 10:02:01: 	Node 'LSTMoutput3.b' (LearnableParameter operation) : [4096 x 1]
+08/16/2016 10:02:01: 	Node 'LSTMoutput3.wx' (LearnableParameter operation) : [4096 x 512 x 1]
+08/16/2016 10:02:01: 	Node 'W' (LearnableParameter operation) : [132 x 512 x 1]
+08/16/2016 10:02:01: 	Node 'b' (LearnableParameter operation) : [132 x 1]
+
+
+08/16/2016 10:02:01: Precomputing --> 3 PreCompute nodes found.
+
+08/16/2016 10:02:01: 	featNorm.xMean = Mean()
+08/16/2016 10:02:01: 	featNorm.xStdDev = InvStdDev()
+08/16/2016 10:02:01: 	logPrior.prior = Mean()
+minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
+requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
+
+08/16/2016 10:02:02: Precomputing --> Completed.
+
+
+08/16/2016 10:02:02: Starting Epoch 1: learning rate per sample = 0.001953  effective momentum = 0.000000  momentum as time constant = 0.0 samples
+minibatchiterator: epoch 0: frames [0..64] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
+
+08/16/2016 10:02:03: Starting minibatch loop.
+08/16/2016 10:02:03:  Epoch[ 1 of 1]-Minibatch[   1-  10, 250.00%]: ce = 4.87453079 * 160; err = 0.90625000 * 160; time = 0.5069s; samplesPerSecond = 315.6
+08/16/2016 10:02:03:  Epoch[ 1 of 1]-Minibatch[  11-  20, 500.00%]: ce = 4.84628143 * 160; err = 0.69375000 * 160; time = 0.4852s; samplesPerSecond = 329.8
+08/16/2016 10:02:04: Finished Epoch[ 1 of 1]: [Training] ce = 4.85708837 * 418; err = 0.80382775 * 418; totalSamplesSeen = 418; learningRatePerSample = 0.001953125; epochTime=1.33633s
+08/16/2016 10:02:04: SGD: Saving checkpoint model '/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu/Models/cntkSpeechLSTM.dnn'
+08/16/2016 10:02:05: CNTKCommandTrainEnd: speechTrain
+
+08/16/2016 10:02:05: Action "train" complete.
+
+08/16/2016 10:02:05: __COMPLETED__
--- a/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/baseline.linux.release.cpu.txt
+++ b/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/baseline.linux.release.cpu.txt
@ -1 +0,0 @@
-__COMPLETED__
--- a/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/baseline.linux.release.gpu.txt
+++ b/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/baseline.linux.release.gpu.txt
@ -1 +0,0 @@
-__COMPLETED__
--- a/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/baseline.windows.cpu.txt
+++ b/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/baseline.windows.cpu.txt
@ -0,0 +1,681 @@
+CPU info:
+    CPU Model Name: Intel(R) Xeon(R) CPU E5-2630 v2 @ 2.60GHz
+    Hardware threads: 24
+    Total Memory: 268381192 kB
+-------------------------------------------------------------------
+=== Running /cygdrive/c/jenkins/workspace/CNTK-Test-Windows-W1/x64/release/cntk.exe configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/LSTM-NDL.cntk currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu DeviceId=-1 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=64]] speechTrain=[reader=[useMersenneTwisterRand=true]] parallelTrain=false
+-------------------------------------------------------------------
+Build info: 
+
+		Built time: Aug 16 2016 03:09:16
+		Last modified date: Fri Aug 12 05:28:23 2016
+		Build type: Release
+		Build target: GPU
+		With 1bit-SGD: yes
+		Math lib: mkl
+		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5
+		CUB_PATH: c:\src\cub-1.4.1
+		CUDNN_PATH: c:\NVIDIA\cudnn-4.0\cuda
+		Build Branch: HEAD
+		Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
+		Built by svcphil on Philly-Pool1
+		Build Path: c:\jenkins\workspace\CNTK-Build-Windows\Source\CNTK\
+-------------------------------------------------------------------
+Changed current directory to C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
+08/16/2016 03:20:22: -------------------------------------------------------------------
+08/16/2016 03:20:22: Build info: 
+
+08/16/2016 03:20:22: 		Built time: Aug 16 2016 03:09:16
+08/16/2016 03:20:22: 		Last modified date: Fri Aug 12 05:28:23 2016
+08/16/2016 03:20:22: 		Build type: Release
+08/16/2016 03:20:22: 		Build target: GPU
+08/16/2016 03:20:22: 		With 1bit-SGD: yes
+08/16/2016 03:20:22: 		Math lib: mkl
+08/16/2016 03:20:22: 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5
+08/16/2016 03:20:22: 		CUB_PATH: c:\src\cub-1.4.1
+08/16/2016 03:20:22: 		CUDNN_PATH: c:\NVIDIA\cudnn-4.0\cuda
+08/16/2016 03:20:22: 		Build Branch: HEAD
+08/16/2016 03:20:22: 		Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
+08/16/2016 03:20:22: 		Built by svcphil on Philly-Pool1
+08/16/2016 03:20:22: 		Build Path: c:\jenkins\workspace\CNTK-Build-Windows\Source\CNTK\
+08/16/2016 03:20:22: -------------------------------------------------------------------
+08/16/2016 03:20:23: -------------------------------------------------------------------
+08/16/2016 03:20:23: GPU info:
+
+08/16/2016 03:20:23: 		Device[0]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
+08/16/2016 03:20:23: 		Device[1]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
+08/16/2016 03:20:23: 		Device[2]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
+08/16/2016 03:20:23: -------------------------------------------------------------------
+
+08/16/2016 03:20:23: Running on DPHAIM-25 at 2016/08/16 03:20:23
+08/16/2016 03:20:23: Command line: 
+C:\jenkins\workspace\CNTK-Test-Windows-W1\x64\release\cntk.exe  configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/LSTM-NDL.cntk  currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data  RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu  DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data  ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config  OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu  DeviceId=-1  timestamping=true  speechTrain=[SGD=[maxEpochs=1]]  speechTrain=[SGD=[epochSize=64]]  speechTrain=[reader=[useMersenneTwisterRand=true]]  parallelTrain=false
+
+
+
+08/16/2016 03:20:23: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
+08/16/2016 03:20:23: RootDir = ".."
+ConfigDir = "$RootDir$/Config"
+DataDir = "$RootDir$/Data"
+OutputDir = "$RootDir$/Output"
+ModelDir = "$OutputDir$/Models"
+deviceId = -1
+command = speechTrain
+precision = "float"
+traceLevel = 1
+modelPath = "$ModelDir$/cntkSpeechLSTM.dnn"
+parallelTrain = true
+frameMode = false
+truncated = true
+speechTrain = [
+    action = "train"
+    nbrUttsIneachRecurrentIter = 16
+    NDLNetworkBuilder = [
+        networkDescription = "$ConfigDir$/lstmp-3layer-opt.ndl"
+    ]
+    SGD = [
+        epochSize = 0
+        minibatchSize = 16
+        learningRatesPerMB = 0.5
+        numMBsToShowResult = 10
+        momentumPerMB = 0:0.9
+        maxEpochs = 4
+        keepCheckPointFiles = true       
+    ]
+    reader = [
+        readerType = "HTKMLFReader"
+        readMethod = "blockRandomize"
+        miniBatchMode = "partial"
+        randomize = "auto"
+        verbosity = 0
+        features = [
+            dim = 363
+            type = "real"
+            scpFile = "$DataDir$/glob_0000.scp"
+        ]
+        labels = [
+            mlfFile = "$DataDir$/glob_0000.mlf"
+            labelMappingFile = "$DataDir$/state.list"
+            labelDim = 132
+            labelType = "category"
+        ]
+    ]
+]
+currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
+RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu
+DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
+ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
+OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu
+DeviceId=-1
+timestamping=true
+speechTrain=[SGD=[maxEpochs=1]]
+speechTrain=[SGD=[epochSize=64]]
+speechTrain=[reader=[useMersenneTwisterRand=true]]
+parallelTrain=false
+
+08/16/2016 03:20:23: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
+
+08/16/2016 03:20:23: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+08/16/2016 03:20:23: RootDir = ".."
+ConfigDir = "../Config"
+DataDir = "../Data"
+OutputDir = "../Output"
+ModelDir = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu/Models"
+deviceId = -1
+command = speechTrain
+precision = "float"
+traceLevel = 1
+modelPath = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu/Models/cntkSpeechLSTM.dnn"
+parallelTrain = true
+frameMode = false
+truncated = true
+speechTrain = [
+    action = "train"
+    nbrUttsIneachRecurrentIter = 16
+    NDLNetworkBuilder = [
+        networkDescription = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/lstmp-3layer-opt.ndl"
+    ]
+    SGD = [
+        epochSize = 0
+        minibatchSize = 16
+        learningRatesPerMB = 0.5
+        numMBsToShowResult = 10
+        momentumPerMB = 0:0.9
+        maxEpochs = 4
+        keepCheckPointFiles = true       
+    ]
+    reader = [
+        readerType = "HTKMLFReader"
+        readMethod = "blockRandomize"
+        miniBatchMode = "partial"
+        randomize = "auto"
+        verbosity = 0
+        features = [
+            dim = 363
+            type = "real"
+            scpFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.scp"
+        ]
+        labels = [
+            mlfFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.mlf"
+            labelMappingFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/state.list"
+            labelDim = 132
+            labelType = "category"
+        ]
+    ]
+]
+currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
+RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu
+DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
+ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
+OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu
+DeviceId=-1
+timestamping=true
+speechTrain=[SGD=[maxEpochs=1]]
+speechTrain=[SGD=[epochSize=64]]
+speechTrain=[reader=[useMersenneTwisterRand=true]]
+parallelTrain=false
+
+08/16/2016 03:20:23: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+
+08/16/2016 03:20:23: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+configparameters: LSTM-NDL.cntk:command=speechTrain
+configparameters: LSTM-NDL.cntk:ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
+configparameters: LSTM-NDL.cntk:currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
+configparameters: LSTM-NDL.cntk:DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
+configparameters: LSTM-NDL.cntk:deviceId=-1
+configparameters: LSTM-NDL.cntk:frameMode=false
+configparameters: LSTM-NDL.cntk:ModelDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu/Models
+configparameters: LSTM-NDL.cntk:modelPath=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu/Models/cntkSpeechLSTM.dnn
+configparameters: LSTM-NDL.cntk:OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu
+configparameters: LSTM-NDL.cntk:parallelTrain=false
+configparameters: LSTM-NDL.cntk:precision=float
+configparameters: LSTM-NDL.cntk:RootDir=..
+configparameters: LSTM-NDL.cntk:RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu
+configparameters: LSTM-NDL.cntk:speechTrain=[
+    action = "train"
+    nbrUttsIneachRecurrentIter = 16
+    NDLNetworkBuilder = [
+        networkDescription = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/lstmp-3layer-opt.ndl"
+    ]
+    SGD = [
+        epochSize = 0
+        minibatchSize = 16
+        learningRatesPerMB = 0.5
+        numMBsToShowResult = 10
+        momentumPerMB = 0:0.9
+        maxEpochs = 4
+        keepCheckPointFiles = true       
+    ]
+    reader = [
+        readerType = "HTKMLFReader"
+        readMethod = "blockRandomize"
+        miniBatchMode = "partial"
+        randomize = "auto"
+        verbosity = 0
+        features = [
+            dim = 363
+            type = "real"
+            scpFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.scp"
+        ]
+        labels = [
+            mlfFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.mlf"
+            labelMappingFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/state.list"
+            labelDim = 132
+            labelType = "category"
+        ]
+    ]
+] [SGD=[maxEpochs=1]] [SGD=[epochSize=64]] [reader=[useMersenneTwisterRand=true]]
+
+configparameters: LSTM-NDL.cntk:timestamping=true
+configparameters: LSTM-NDL.cntk:traceLevel=1
+configparameters: LSTM-NDL.cntk:truncated=true
+08/16/2016 03:20:23: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+08/16/2016 03:20:23: Commands: speechTrain
+08/16/2016 03:20:23: Precision = "float"
+08/16/2016 03:20:23: CNTKModelPath: C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu/Models/cntkSpeechLSTM.dnn
+08/16/2016 03:20:23: CNTKCommandTrainInfo: speechTrain : 1
+08/16/2016 03:20:23: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 1
+
+08/16/2016 03:20:23: ##############################################################################
+08/16/2016 03:20:23: #                                                                            #
+08/16/2016 03:20:23: # Action "train"                                                             #
+08/16/2016 03:20:23: #                                                                            #
+08/16/2016 03:20:23: ##############################################################################
+
+08/16/2016 03:20:23: CNTKCommandTrainBegin: speechTrain
+NDLBuilder Using CPU
+reading script file C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.scp ... 948 entries
+total 132 state names in state list C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/state.list
+htkmlfreader: reading MLF file C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.mlf ... total 948 entries
+...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
+label set 0: 129 classes
+minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
+useParallelTrain option is not enabled. ParallelTrain config will be ignored.
+08/16/2016 03:20:24: Creating virgin network.
+Node 'LSTMoutput1.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput1.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput1.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput1.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput1.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
+Node 'LSTMoutput2.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput2.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput2.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput2.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput2.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
+Node 'LSTMoutput3.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput3.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput3.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput3.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput3.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
+Node 'b' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
+Node 'LSTMoutput1.wx' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput1.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput1.Wh' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput1.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=3, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput1.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=4, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput1.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=5, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput1.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=6, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput2.wx' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput2.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput2.Wh' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput2.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=9, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput2.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=10, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput2.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=11, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput2.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=12, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput3.wx' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput3.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput3.Wh' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput3.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=15, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput3.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=16, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput3.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=17, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput3.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=18, range=0.050000*1.000000, onCPU=false).
+Node 'W' (LearnableParameter operation): Initializating Parameter[132 x 0] as uniform later when dimensions are fully known.
+Node 'b' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
+
+Post-processing network...
+
+6 roots:
+	ce = CrossEntropyWithSoftmax()
+	err = ErrorPrediction()
+	featNorm.xMean = Mean()
+	featNorm.xStdDev = InvStdDev()
+	logPrior.prior = Mean()
+	scaledLogLikelihood = Minus()
+
+Loop[0] --> Loop_LSTMoutput1.output -> 24 nodes
+
+	LSTMoutput1.dh	LSTMoutput1.whh	LSTMoutput1.wxxpbpwhh
+	LSTMoutput1.G4	LSTMoutput1.G3	LSTMoutput1.dc
+	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft
+	LSTMoutput1.bft	LSTMoutput1.G1	LSTMoutput1.Wcidc
+	LSTMoutput1.unnamed163	LSTMoutput1.it	LSTMoutput1.G2
+	LSTMoutput1.unnamed164	LSTMoutput1.bit	LSTMoutput1.ct
+	LSTMoutput1.Wcoct	LSTMoutput1.unnamed166	LSTMoutput1.ot
+	LSTMoutput1.unnamed167	LSTMoutput1.mt	LSTMoutput1.output
+
+Loop[1] --> Loop_LSTMoutput2.output -> 24 nodes
+
+	LSTMoutput2.dh	LSTMoutput2.whh	LSTMoutput2.wxxpbpwhh
+	LSTMoutput2.G4	LSTMoutput2.G3	LSTMoutput2.dc
+	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed175	LSTMoutput2.ft
+	LSTMoutput2.bft	LSTMoutput2.G1	LSTMoutput2.Wcidc
+	LSTMoutput2.unnamed173	LSTMoutput2.it	LSTMoutput2.G2
+	LSTMoutput2.unnamed174	LSTMoutput2.bit	LSTMoutput2.ct
+	LSTMoutput2.Wcoct	LSTMoutput2.unnamed176	LSTMoutput2.ot
+	LSTMoutput2.unnamed177	LSTMoutput2.mt	LSTMoutput2.output
+
+Loop[2] --> Loop_LSTMoutput3.output -> 24 nodes
+
+	LSTMoutput3.dh	LSTMoutput3.whh	LSTMoutput3.wxxpbpwhh
+	LSTMoutput3.G4	LSTMoutput3.G3	LSTMoutput3.dc
+	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed185	LSTMoutput3.ft
+	LSTMoutput3.bft	LSTMoutput3.G1	LSTMoutput3.Wcidc
+	LSTMoutput3.unnamed183	LSTMoutput3.it	LSTMoutput3.G2
+	LSTMoutput3.unnamed184	LSTMoutput3.bit	LSTMoutput3.ct
+	LSTMoutput3.Wcoct	LSTMoutput3.unnamed186	LSTMoutput3.ot
+	LSTMoutput3.unnamed187	LSTMoutput3.mt	LSTMoutput3.output
+
+Validating network. 113 nodes to process in pass 1.
+
+Validating --> labels = InputValue() :  -> [132 x *]
+Validating --> W = LearnableParameter() :  -> [132 x 0]
+Validating --> LSTMoutput3.Wmr = LearnableParameter() :  -> [512 x 1024]
+Validating --> LSTMoutput3.wx = LearnableParameter() :  -> [4096 x 0]
+Validating --> LSTMoutput2.Wmr = LearnableParameter() :  -> [512 x 1024]
+Validating --> LSTMoutput2.wx = LearnableParameter() :  -> [4096 x 0]
+Validating --> LSTMoutput1.Wmr = LearnableParameter() :  -> [512 x 1024]
+Validating --> LSTMoutput1.wx = LearnableParameter() :  -> [4096 x 0]
+Validating --> features = InputValue() :  -> [363 x *]
+Validating --> featNorm.xMean = Mean (features) : [363 x *] -> [363]
+Validating --> featNorm.xStdDev = InvStdDev (features) : [363 x *] -> [363]
+Validating --> featNorm.xNorm = PerDimMeanVarNormalization (features, featNorm.xMean, featNorm.xStdDev) : [363 x *], [363], [363] -> [363 x *]
+Node 'LSTMoutput1.wx' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 363].
+Node 'LSTMoutput1.wx' (LearnableParameter operation): Initializing Parameter[4096 x 363] <- uniform(seed=1, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput1.wxx = Times (LSTMoutput1.wx, featNorm.xNorm) : [4096 x 363], [363 x *] -> [4096 x *]
+Validating --> LSTMoutput1.b = LearnableParameter() :  -> [4096 x 1]
+Validating --> LSTMoutput1.wxxpb = Plus (LSTMoutput1.wxx, LSTMoutput1.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
+Validating --> LSTMoutput1.Wh = LearnableParameter() :  -> [4096 x 0]
+Validating --> LSTMoutput1.Wco = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput1.Wcf = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput1.Wci = LearnableParameter() :  -> [1024]
+Node 'LSTMoutput1.Wh' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512].
+Node 'LSTMoutput1.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=2, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput1.whh = Times (LSTMoutput1.Wh, LSTMoutput1.dh) : [4096 x 512], [512] -> [4096]
+Validating --> LSTMoutput1.wxxpbpwhh = Plus (LSTMoutput1.wxxpb, LSTMoutput1.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
+Validating --> LSTMoutput1.G4 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.G3 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.Wcfdc = DiagTimes (LSTMoutput1.Wcf, LSTMoutput1.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput1.unnamed165 = Plus (LSTMoutput1.G3, LSTMoutput1.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.ft = Sigmoid (LSTMoutput1.unnamed165) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.bft = ElementTimes (LSTMoutput1.ft, LSTMoutput1.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.G1 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.Wcidc = DiagTimes (LSTMoutput1.Wci, LSTMoutput1.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput1.unnamed163 = Plus (LSTMoutput1.G1, LSTMoutput1.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.it = Sigmoid (LSTMoutput1.unnamed163) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.G2 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.unnamed164 = Tanh (LSTMoutput1.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.bit = ElementTimes (LSTMoutput1.it, LSTMoutput1.unnamed164) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.ct = Plus (LSTMoutput1.bft, LSTMoutput1.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.Wcoct = DiagTimes (LSTMoutput1.Wco, LSTMoutput1.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.unnamed166 = Plus (LSTMoutput1.G4, LSTMoutput1.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.ot = Sigmoid (LSTMoutput1.unnamed166) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.unnamed167 = Tanh (LSTMoutput1.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.mt = ElementTimes (LSTMoutput1.ot, LSTMoutput1.unnamed167) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.output = Times (LSTMoutput1.Wmr, LSTMoutput1.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
+Node 'LSTMoutput2.wx' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512 x 1].
+Node 'LSTMoutput2.wx' (LearnableParameter operation): Initializing Parameter[4096 x 512 x 1] <- uniform(seed=7, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput2.wxx = Times (LSTMoutput2.wx, LSTMoutput1.output) : [4096 x 512 x 1], [512 x 1 x *] -> [4096 x *]
+Validating --> LSTMoutput2.b = LearnableParameter() :  -> [4096 x 1]
+Validating --> LSTMoutput2.wxxpb = Plus (LSTMoutput2.wxx, LSTMoutput2.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
+Validating --> LSTMoutput2.Wh = LearnableParameter() :  -> [4096 x 0]
+Validating --> LSTMoutput2.Wco = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput2.Wcf = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput2.Wci = LearnableParameter() :  -> [1024]
+Node 'LSTMoutput2.Wh' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512].
+Node 'LSTMoutput2.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=8, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput2.whh = Times (LSTMoutput2.Wh, LSTMoutput2.dh) : [4096 x 512], [512] -> [4096]
+Validating --> LSTMoutput2.wxxpbpwhh = Plus (LSTMoutput2.wxxpb, LSTMoutput2.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
+Validating --> LSTMoutput2.G4 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.G3 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.Wcfdc = DiagTimes (LSTMoutput2.Wcf, LSTMoutput2.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput2.unnamed175 = Plus (LSTMoutput2.G3, LSTMoutput2.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.ft = Sigmoid (LSTMoutput2.unnamed175) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.bft = ElementTimes (LSTMoutput2.ft, LSTMoutput2.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.G1 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.Wcidc = DiagTimes (LSTMoutput2.Wci, LSTMoutput2.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput2.unnamed173 = Plus (LSTMoutput2.G1, LSTMoutput2.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.it = Sigmoid (LSTMoutput2.unnamed173) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.G2 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.unnamed174 = Tanh (LSTMoutput2.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.bit = ElementTimes (LSTMoutput2.it, LSTMoutput2.unnamed174) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.ct = Plus (LSTMoutput2.bft, LSTMoutput2.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.Wcoct = DiagTimes (LSTMoutput2.Wco, LSTMoutput2.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.unnamed176 = Plus (LSTMoutput2.G4, LSTMoutput2.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.ot = Sigmoid (LSTMoutput2.unnamed176) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.unnamed177 = Tanh (LSTMoutput2.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.mt = ElementTimes (LSTMoutput2.ot, LSTMoutput2.unnamed177) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.output = Times (LSTMoutput2.Wmr, LSTMoutput2.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
+Node 'LSTMoutput3.wx' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512 x 1].
+Node 'LSTMoutput3.wx' (LearnableParameter operation): Initializing Parameter[4096 x 512 x 1] <- uniform(seed=13, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput3.wxx = Times (LSTMoutput3.wx, LSTMoutput2.output) : [4096 x 512 x 1], [512 x 1 x *] -> [4096 x *]
+Validating --> LSTMoutput3.b = LearnableParameter() :  -> [4096 x 1]
+Validating --> LSTMoutput3.wxxpb = Plus (LSTMoutput3.wxx, LSTMoutput3.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
+Validating --> LSTMoutput3.Wh = LearnableParameter() :  -> [4096 x 0]
+Validating --> LSTMoutput3.Wco = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput3.Wcf = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput3.Wci = LearnableParameter() :  -> [1024]
+Node 'LSTMoutput3.Wh' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512].
+Node 'LSTMoutput3.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=14, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput3.whh = Times (LSTMoutput3.Wh, LSTMoutput3.dh) : [4096 x 512], [512] -> [4096]
+Validating --> LSTMoutput3.wxxpbpwhh = Plus (LSTMoutput3.wxxpb, LSTMoutput3.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
+Validating --> LSTMoutput3.G4 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.G3 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.Wcfdc = DiagTimes (LSTMoutput3.Wcf, LSTMoutput3.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput3.unnamed185 = Plus (LSTMoutput3.G3, LSTMoutput3.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.ft = Sigmoid (LSTMoutput3.unnamed185) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.bft = ElementTimes (LSTMoutput3.ft, LSTMoutput3.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.G1 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.Wcidc = DiagTimes (LSTMoutput3.Wci, LSTMoutput3.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput3.unnamed183 = Plus (LSTMoutput3.G1, LSTMoutput3.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.it = Sigmoid (LSTMoutput3.unnamed183) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.G2 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.unnamed184 = Tanh (LSTMoutput3.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.bit = ElementTimes (LSTMoutput3.it, LSTMoutput3.unnamed184) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.ct = Plus (LSTMoutput3.bft, LSTMoutput3.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.Wcoct = DiagTimes (LSTMoutput3.Wco, LSTMoutput3.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.unnamed186 = Plus (LSTMoutput3.G4, LSTMoutput3.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.ot = Sigmoid (LSTMoutput3.unnamed186) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.unnamed187 = Tanh (LSTMoutput3.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.mt = ElementTimes (LSTMoutput3.ot, LSTMoutput3.unnamed187) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.output = Times (LSTMoutput3.Wmr, LSTMoutput3.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
+Node 'W' (LearnableParameter operation) operation: Tensor shape was inferred as [132 x 512 x 1].
+Node 'W' (LearnableParameter operation): Initializing Parameter[132 x 512 x 1] <- uniform(seed=19, range=0.050000*1.000000, onCPU=false).
+Validating --> unnamed193 = Times (W, LSTMoutput3.output) : [132 x 512 x 1], [512 x 1 x *] -> [132 x *]
+Validating --> b = LearnableParameter() :  -> [132 x 1]
+Validating --> LSTMoutputW = Plus (unnamed193, b) : [132 x *], [132 x 1] -> [132 x 1 x *]
+Validating --> ce = CrossEntropyWithSoftmax (labels, LSTMoutputW) : [132 x *], [132 x 1 x *] -> [1]
+Validating --> err = ErrorPrediction (labels, LSTMoutputW) : [132 x *], [132 x 1 x *] -> [1]
+Validating --> logPrior.prior = Mean (labels) : [132 x *] -> [132]
+Validating --> logPrior.logPrior = Log (logPrior.prior) : [132] -> [132]
+Validating --> scaledLogLikelihood = Minus (LSTMoutputW, logPrior.logPrior) : [132 x 1 x *], [132] -> [132 x 1 x *]
+
+Validating network. 88 nodes to process in pass 2.
+
+Validating --> LSTMoutput1.dh = PastValue (LSTMoutput1.output) : [512 x 1 x *] -> [512 x 1 x *]
+Validating --> LSTMoutput1.whh = Times (LSTMoutput1.Wh, LSTMoutput1.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
+Validating --> LSTMoutput1.dc = PastValue (LSTMoutput1.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.Wcfdc = DiagTimes (LSTMoutput1.Wcf, LSTMoutput1.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.Wcidc = DiagTimes (LSTMoutput1.Wci, LSTMoutput1.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.dh = PastValue (LSTMoutput2.output) : [512 x 1 x *] -> [512 x 1 x *]
+Validating --> LSTMoutput2.whh = Times (LSTMoutput2.Wh, LSTMoutput2.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
+Validating --> LSTMoutput2.dc = PastValue (LSTMoutput2.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.Wcfdc = DiagTimes (LSTMoutput2.Wcf, LSTMoutput2.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.Wcidc = DiagTimes (LSTMoutput2.Wci, LSTMoutput2.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.dh = PastValue (LSTMoutput3.output) : [512 x 1 x *] -> [512 x 1 x *]
+Validating --> LSTMoutput3.whh = Times (LSTMoutput3.Wh, LSTMoutput3.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
+Validating --> LSTMoutput3.dc = PastValue (LSTMoutput3.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.Wcfdc = DiagTimes (LSTMoutput3.Wcf, LSTMoutput3.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.Wcidc = DiagTimes (LSTMoutput3.Wci, LSTMoutput3.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+
+Validating network. 15 nodes to process in pass 3.
+
+
+Validating network, final pass.
+
+
+
+29 out of 113 nodes do not share the minibatch layout with the input data.
+
+Post-processing network complete.
+
+08/16/2016 03:20:24: Created model with 113 nodes on CPU.
+
+08/16/2016 03:20:24: Training criterion node(s):
+08/16/2016 03:20:24: 	ce = CrossEntropyWithSoftmax
+
+08/16/2016 03:20:24: Evaluation criterion node(s):
+08/16/2016 03:20:24: 	err = ErrorPrediction
+
+
+Allocating matrices for forward and/or backward propagation.
+
+Memory Sharing: Out of 217 matrices, 125 are shared as 56, and 92 are not shared.
+
+	{ LSTMoutput1.dh : [512 x 1 x *]
+	  LSTMoutput1.wxx : [4096 x *] (gradient) }
+	{ LSTMoutput2.mt : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.dh : [512 x 1 x *]
+	  LSTMoutput3.wxx : [4096 x *] (gradient) }
+	{ LSTMoutput2.Wco : [1024] (gradient)
+	  LSTMoutput3.dc : [1024 x 1 x *] }
+	{ LSTMoutput1.mt : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.dh : [512 x 1 x *]
+	  LSTMoutput2.wxx : [4096 x *] (gradient) }
+	{ LSTMoutput1.Wco : [1024] (gradient)
+	  LSTMoutput2.dc : [1024 x 1 x *] }
+	{ LSTMoutput1.G3 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.Wcidc : [1024 x 1 x *] }
+	{ LSTMoutput1.unnamed164 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.Wcfdc : [1024 x 1 x *] }
+	{ LSTMoutput1.Wci : [1024] (gradient)
+	  LSTMoutput2.G1 : [1024 x 1 x *] }
+	{ LSTMoutput1.wxxpbpwhh : [4096 x 1 x *] (gradient)
+	  LSTMoutput2.unnamed175 : [1024 x 1 x *] }
+	{ LSTMoutput2.Wcf : [1024] (gradient)
+	  LSTMoutput3.it : [1024 x 1 x *] }
+	{ LSTMoutput1.ct : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.wxxpbpwhh : [4096 x 1 x *] }
+	{ LSTMoutput3.wx : [4096 x 512 x 1] (gradient)
+	  LSTMoutput3.wxxpb : [4096 x 1 x *] }
+	{ LSTMoutput1.Wmr : [512 x 1024] (gradient)
+	  LSTMoutput2.wxx : [4096 x *] }
+	{ LSTMoutput1.Wcoct : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.G4 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.G4 : [1024 x 1 x *] }
+	{ LSTMoutput1.Wcf : [1024] (gradient)
+	  LSTMoutput2.it : [1024 x 1 x *] }
+	{ LSTMoutput2.unnamed174 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.Wcfdc : [1024 x 1 x *] }
+	{ LSTMoutput1.G1 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.ft : [1024 x 1 x *] }
+	{ LSTMoutput1.dc : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.G1 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.ft : [1024 x 1 x *] }
+	{ LSTMoutput1.unnamed165 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.bft : [1024 x 1 x *] }
+	{ LSTMoutput2.G3 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.Wcidc : [1024 x 1 x *] }
+	{ LSTMoutput1.ot : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.whh : [4096 x 1 x *] }
+	{ LSTMoutput2.ot : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.whh : [4096 x 1 x *] }
+	{ LSTMoutput2.ct : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.wxxpbpwhh : [4096 x 1 x *] }
+	{ LSTMoutput1.whh : [4096 x 1 x *] (gradient)
+	  LSTMoutput2.G2 : [1024 x 1 x *] }
+	{ LSTMoutput2.wx : [4096 x 512 x 1] (gradient)
+	  LSTMoutput2.wxxpb : [4096 x 1 x *] }
+	{ LSTMoutput1.b : [4096 x 1] (gradient)
+	  LSTMoutput1.dh : [512 x 1 x *] (gradient)
+	  LSTMoutput2.unnamed174 : [1024 x 1 x *] }
+	{ LSTMoutput1.unnamed166 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.wxxpbpwhh : [4096 x 1 x *] (gradient)
+	  LSTMoutput3.unnamed185 : [1024 x 1 x *] }
+	{ LSTMoutput2.Wci : [1024] (gradient)
+	  LSTMoutput3.G1 : [1024 x 1 x *] }
+	{ LSTMoutput1.it : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.unnamed183 : [1024 x 1 x *] }
+	{ LSTMoutput1.unnamed167 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.whh : [4096 x 1 x *] (gradient)
+	  LSTMoutput3.G2 : [1024 x 1 x *] }
+	{ LSTMoutput2.Wmr : [512 x 1024] (gradient)
+	  LSTMoutput3.wxx : [4096 x *] }
+	{ LSTMoutput2.b : [4096 x 1] (gradient)
+	  LSTMoutput2.dh : [512 x 1 x *] (gradient)
+	  LSTMoutput3.unnamed184 : [1024 x 1 x *] }
+	{ LSTMoutput1.G4 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.G4 : [1024 x 1 x *] }
+	{ LSTMoutput2.unnamed176 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.wxxpbpwhh : [4096 x 1 x *] (gradient) }
+	{ LSTMoutput1.bit : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.unnamed183 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.Wh : [4096 x 512] (gradient)
+	  LSTMoutput2.G2 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.Wcfdc : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.wxxpb : [4096 x 1 x *] (gradient)
+	  LSTMoutput2.it : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput2.unnamed177 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.whh : [4096 x 1 x *] (gradient) }
+	{ LSTMoutput3.output : [512 x 1 x *] (gradient)
+	  LSTMoutputW : [132 x 1 x *] (gradient) }
+	{ LSTMoutput2.bft : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.dc : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput2.Wh : [4096 x 512] (gradient)
+	  LSTMoutput3.G2 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.bft : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.dc : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.G1 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.unnamed163 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.unnamed175 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput2.unnamed173 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.unnamed185 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput2.output : [512 x 1 x *] (gradient)
+	  LSTMoutput3.wxxpb : [4096 x 1 x *] (gradient) }
+	{ LSTMoutput3.b : [4096 x 1] (gradient)
+	  LSTMoutput3.dh : [512 x 1 x *] (gradient) }
+	{ LSTMoutput2.Wcoct : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.G4 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.Wcidc : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.ft : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.G2 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.Wcfdc : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.Wcidc : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput3.Wmr : [512 x 1024] (gradient)
+	  unnamed193 : [132 x *] }
+	{ LSTMoutput1.output : [512 x 1 x *] (gradient)
+	  LSTMoutput2.wxxpb : [4096 x 1 x *] (gradient)
+	  LSTMoutput3.it : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput3.mt : [1024 x 1 x *] (gradient)
+	  unnamed193 : [132 x *] (gradient) }
+	{ LSTMoutput1.Wcfdc : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.Wcidc : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.ft : [1024 x 1 x *] (gradient) }
+	{ LSTMoutputW : [132 x 1 x *]
+	  W : [132 x 512 x 1] (gradient) }
+	{ LSTMoutput1.ft : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.bft : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.wx : [4096 x 363] (gradient)
+	  LSTMoutput1.wxxpb : [4096 x 1 x *] }
+
+
+08/16/2016 03:20:24: Training 13634692 parameters in 23 out of 23 parameter tensors and 104 nodes with gradient:
+
+08/16/2016 03:20:24: 	Node 'LSTMoutput1.Wcf' (LearnableParameter operation) : [1024]
+08/16/2016 03:20:24: 	Node 'LSTMoutput1.Wci' (LearnableParameter operation) : [1024]
+08/16/2016 03:20:24: 	Node 'LSTMoutput1.Wco' (LearnableParameter operation) : [1024]
+08/16/2016 03:20:24: 	Node 'LSTMoutput1.Wh' (LearnableParameter operation) : [4096 x 512]
+08/16/2016 03:20:24: 	Node 'LSTMoutput1.Wmr' (LearnableParameter operation) : [512 x 1024]
+08/16/2016 03:20:24: 	Node 'LSTMoutput1.b' (LearnableParameter operation) : [4096 x 1]
+08/16/2016 03:20:24: 	Node 'LSTMoutput1.wx' (LearnableParameter operation) : [4096 x 363]
+08/16/2016 03:20:24: 	Node 'LSTMoutput2.Wcf' (LearnableParameter operation) : [1024]
+08/16/2016 03:20:24: 	Node 'LSTMoutput2.Wci' (LearnableParameter operation) : [1024]
+08/16/2016 03:20:24: 	Node 'LSTMoutput2.Wco' (LearnableParameter operation) : [1024]
+08/16/2016 03:20:24: 	Node 'LSTMoutput2.Wh' (LearnableParameter operation) : [4096 x 512]
+08/16/2016 03:20:24: 	Node 'LSTMoutput2.Wmr' (LearnableParameter operation) : [512 x 1024]
+08/16/2016 03:20:24: 	Node 'LSTMoutput2.b' (LearnableParameter operation) : [4096 x 1]
+08/16/2016 03:20:24: 	Node 'LSTMoutput2.wx' (LearnableParameter operation) : [4096 x 512 x 1]
+08/16/2016 03:20:24: 	Node 'LSTMoutput3.Wcf' (LearnableParameter operation) : [1024]
+08/16/2016 03:20:24: 	Node 'LSTMoutput3.Wci' (LearnableParameter operation) : [1024]
+08/16/2016 03:20:24: 	Node 'LSTMoutput3.Wco' (LearnableParameter operation) : [1024]
+08/16/2016 03:20:24: 	Node 'LSTMoutput3.Wh' (LearnableParameter operation) : [4096 x 512]
+08/16/2016 03:20:24: 	Node 'LSTMoutput3.Wmr' (LearnableParameter operation) : [512 x 1024]
+08/16/2016 03:20:24: 	Node 'LSTMoutput3.b' (LearnableParameter operation) : [4096 x 1]
+08/16/2016 03:20:24: 	Node 'LSTMoutput3.wx' (LearnableParameter operation) : [4096 x 512 x 1]
+08/16/2016 03:20:24: 	Node 'W' (LearnableParameter operation) : [132 x 512 x 1]
+08/16/2016 03:20:24: 	Node 'b' (LearnableParameter operation) : [132 x 1]
+
+
+08/16/2016 03:20:24: Precomputing --> 3 PreCompute nodes found.
+
+08/16/2016 03:20:24: 	featNorm.xMean = Mean()
+08/16/2016 03:20:24: 	featNorm.xStdDev = InvStdDev()
+08/16/2016 03:20:24: 	logPrior.prior = Mean()
+minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
+requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
+
+08/16/2016 03:20:27: Precomputing --> Completed.
+
+
+08/16/2016 03:20:28: Starting Epoch 1: learning rate per sample = 0.001953  effective momentum = 0.000000  momentum as time constant = 0.0 samples
+minibatchiterator: epoch 0: frames [0..64] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
+
+08/16/2016 03:20:28: Starting minibatch loop.
+08/16/2016 03:20:31:  Epoch[ 1 of 1]-Minibatch[   1-  10, 250.00%]: ce = 4.87950134 * 160; err = 0.90625000 * 160; time = 3.6415s; samplesPerSecond = 43.9
+08/16/2016 03:20:35:  Epoch[ 1 of 1]-Minibatch[  11-  20, 500.00%]: ce = 4.84555817 * 160; err = 0.69375000 * 160; time = 3.6742s; samplesPerSecond = 43.5
+08/16/2016 03:20:38: Finished Epoch[ 1 of 1]: [Training] ce = 4.85900003 * 418; err = 0.80382775 * 418; totalSamplesSeen = 418; learningRatePerSample = 0.001953125; epochTime=9.76851s
+08/16/2016 03:20:38: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu/Models/cntkSpeechLSTM.dnn'
+08/16/2016 03:20:39: CNTKCommandTrainEnd: speechTrain
+
+08/16/2016 03:20:39: Action "train" complete.
+
+08/16/2016 03:20:39: __COMPLETED__
--- a/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/baseline.windows.gpu.txt
+++ b/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/baseline.windows.gpu.txt
@ -0,0 +1,682 @@
+CPU info:
+    CPU Model Name: Intel(R) Xeon(R) CPU E5-2630 v2 @ 2.60GHz
+    Hardware threads: 24
+    Total Memory: 268381192 kB
+-------------------------------------------------------------------
+=== Running /cygdrive/c/jenkins/workspace/CNTK-Test-Windows-W1/x64/release/cntk.exe configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/LSTM-NDL.cntk currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu DeviceId=0 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=64]] speechTrain=[reader=[useMersenneTwisterRand=true]] parallelTrain=false
+-------------------------------------------------------------------
+Build info: 
+
+		Built time: Aug 16 2016 03:09:16
+		Last modified date: Fri Aug 12 05:28:23 2016
+		Build type: Release
+		Build target: GPU
+		With 1bit-SGD: yes
+		Math lib: mkl
+		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5
+		CUB_PATH: c:\src\cub-1.4.1
+		CUDNN_PATH: c:\NVIDIA\cudnn-4.0\cuda
+		Build Branch: HEAD
+		Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
+		Built by svcphil on Philly-Pool1
+		Build Path: c:\jenkins\workspace\CNTK-Build-Windows\Source\CNTK\
+-------------------------------------------------------------------
+Changed current directory to C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
+08/16/2016 03:20:41: -------------------------------------------------------------------
+08/16/2016 03:20:41: Build info: 
+
+08/16/2016 03:20:41: 		Built time: Aug 16 2016 03:09:16
+08/16/2016 03:20:41: 		Last modified date: Fri Aug 12 05:28:23 2016
+08/16/2016 03:20:41: 		Build type: Release
+08/16/2016 03:20:41: 		Build target: GPU
+08/16/2016 03:20:41: 		With 1bit-SGD: yes
+08/16/2016 03:20:41: 		Math lib: mkl
+08/16/2016 03:20:41: 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5
+08/16/2016 03:20:41: 		CUB_PATH: c:\src\cub-1.4.1
+08/16/2016 03:20:41: 		CUDNN_PATH: c:\NVIDIA\cudnn-4.0\cuda
+08/16/2016 03:20:41: 		Build Branch: HEAD
+08/16/2016 03:20:41: 		Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
+08/16/2016 03:20:41: 		Built by svcphil on Philly-Pool1
+08/16/2016 03:20:41: 		Build Path: c:\jenkins\workspace\CNTK-Build-Windows\Source\CNTK\
+08/16/2016 03:20:41: -------------------------------------------------------------------
+08/16/2016 03:20:43: -------------------------------------------------------------------
+08/16/2016 03:20:43: GPU info:
+
+08/16/2016 03:20:43: 		Device[0]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
+08/16/2016 03:20:43: 		Device[1]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
+08/16/2016 03:20:43: 		Device[2]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
+08/16/2016 03:20:43: -------------------------------------------------------------------
+
+08/16/2016 03:20:43: Running on DPHAIM-25 at 2016/08/16 03:20:43
+08/16/2016 03:20:43: Command line: 
+C:\jenkins\workspace\CNTK-Test-Windows-W1\x64\release\cntk.exe  configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/LSTM-NDL.cntk  currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data  RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu  DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data  ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config  OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu  DeviceId=0  timestamping=true  speechTrain=[SGD=[maxEpochs=1]]  speechTrain=[SGD=[epochSize=64]]  speechTrain=[reader=[useMersenneTwisterRand=true]]  parallelTrain=false
+
+
+
+08/16/2016 03:20:43: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
+08/16/2016 03:20:43: RootDir = ".."
+ConfigDir = "$RootDir$/Config"
+DataDir = "$RootDir$/Data"
+OutputDir = "$RootDir$/Output"
+ModelDir = "$OutputDir$/Models"
+deviceId = -1
+command = speechTrain
+precision = "float"
+traceLevel = 1
+modelPath = "$ModelDir$/cntkSpeechLSTM.dnn"
+parallelTrain = true
+frameMode = false
+truncated = true
+speechTrain = [
+    action = "train"
+    nbrUttsIneachRecurrentIter = 16
+    NDLNetworkBuilder = [
+        networkDescription = "$ConfigDir$/lstmp-3layer-opt.ndl"
+    ]
+    SGD = [
+        epochSize = 0
+        minibatchSize = 16
+        learningRatesPerMB = 0.5
+        numMBsToShowResult = 10
+        momentumPerMB = 0:0.9
+        maxEpochs = 4
+        keepCheckPointFiles = true       
+    ]
+    reader = [
+        readerType = "HTKMLFReader"
+        readMethod = "blockRandomize"
+        miniBatchMode = "partial"
+        randomize = "auto"
+        verbosity = 0
+        features = [
+            dim = 363
+            type = "real"
+            scpFile = "$DataDir$/glob_0000.scp"
+        ]
+        labels = [
+            mlfFile = "$DataDir$/glob_0000.mlf"
+            labelMappingFile = "$DataDir$/state.list"
+            labelDim = 132
+            labelType = "category"
+        ]
+    ]
+]
+currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
+RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu
+DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
+ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
+OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu
+DeviceId=0
+timestamping=true
+speechTrain=[SGD=[maxEpochs=1]]
+speechTrain=[SGD=[epochSize=64]]
+speechTrain=[reader=[useMersenneTwisterRand=true]]
+parallelTrain=false
+
+08/16/2016 03:20:43: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
+
+08/16/2016 03:20:43: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+08/16/2016 03:20:43: RootDir = ".."
+ConfigDir = "../Config"
+DataDir = "../Data"
+OutputDir = "../Output"
+ModelDir = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu/Models"
+deviceId = -1
+command = speechTrain
+precision = "float"
+traceLevel = 1
+modelPath = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu/Models/cntkSpeechLSTM.dnn"
+parallelTrain = true
+frameMode = false
+truncated = true
+speechTrain = [
+    action = "train"
+    nbrUttsIneachRecurrentIter = 16
+    NDLNetworkBuilder = [
+        networkDescription = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/lstmp-3layer-opt.ndl"
+    ]
+    SGD = [
+        epochSize = 0
+        minibatchSize = 16
+        learningRatesPerMB = 0.5
+        numMBsToShowResult = 10
+        momentumPerMB = 0:0.9
+        maxEpochs = 4
+        keepCheckPointFiles = true       
+    ]
+    reader = [
+        readerType = "HTKMLFReader"
+        readMethod = "blockRandomize"
+        miniBatchMode = "partial"
+        randomize = "auto"
+        verbosity = 0
+        features = [
+            dim = 363
+            type = "real"
+            scpFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.scp"
+        ]
+        labels = [
+            mlfFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.mlf"
+            labelMappingFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/state.list"
+            labelDim = 132
+            labelType = "category"
+        ]
+    ]
+]
+currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
+RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu
+DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
+ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
+OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu
+DeviceId=0
+timestamping=true
+speechTrain=[SGD=[maxEpochs=1]]
+speechTrain=[SGD=[epochSize=64]]
+speechTrain=[reader=[useMersenneTwisterRand=true]]
+parallelTrain=false
+
+08/16/2016 03:20:43: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+
+08/16/2016 03:20:43: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+configparameters: LSTM-NDL.cntk:command=speechTrain
+configparameters: LSTM-NDL.cntk:ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
+configparameters: LSTM-NDL.cntk:currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
+configparameters: LSTM-NDL.cntk:DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
+configparameters: LSTM-NDL.cntk:deviceId=0
+configparameters: LSTM-NDL.cntk:frameMode=false
+configparameters: LSTM-NDL.cntk:ModelDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu/Models
+configparameters: LSTM-NDL.cntk:modelPath=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu/Models/cntkSpeechLSTM.dnn
+configparameters: LSTM-NDL.cntk:OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu
+configparameters: LSTM-NDL.cntk:parallelTrain=false
+configparameters: LSTM-NDL.cntk:precision=float
+configparameters: LSTM-NDL.cntk:RootDir=..
+configparameters: LSTM-NDL.cntk:RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu
+configparameters: LSTM-NDL.cntk:speechTrain=[
+    action = "train"
+    nbrUttsIneachRecurrentIter = 16
+    NDLNetworkBuilder = [
+        networkDescription = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/lstmp-3layer-opt.ndl"
+    ]
+    SGD = [
+        epochSize = 0
+        minibatchSize = 16
+        learningRatesPerMB = 0.5
+        numMBsToShowResult = 10
+        momentumPerMB = 0:0.9
+        maxEpochs = 4
+        keepCheckPointFiles = true       
+    ]
+    reader = [
+        readerType = "HTKMLFReader"
+        readMethod = "blockRandomize"
+        miniBatchMode = "partial"
+        randomize = "auto"
+        verbosity = 0
+        features = [
+            dim = 363
+            type = "real"
+            scpFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.scp"
+        ]
+        labels = [
+            mlfFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.mlf"
+            labelMappingFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/state.list"
+            labelDim = 132
+            labelType = "category"
+        ]
+    ]
+] [SGD=[maxEpochs=1]] [SGD=[epochSize=64]] [reader=[useMersenneTwisterRand=true]]
+
+configparameters: LSTM-NDL.cntk:timestamping=true
+configparameters: LSTM-NDL.cntk:traceLevel=1
+configparameters: LSTM-NDL.cntk:truncated=true
+08/16/2016 03:20:43: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+08/16/2016 03:20:43: Commands: speechTrain
+08/16/2016 03:20:43: Precision = "float"
+08/16/2016 03:20:43: CNTKModelPath: C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu/Models/cntkSpeechLSTM.dnn
+08/16/2016 03:20:43: CNTKCommandTrainInfo: speechTrain : 1
+08/16/2016 03:20:43: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 1
+
+08/16/2016 03:20:43: ##############################################################################
+08/16/2016 03:20:43: #                                                                            #
+08/16/2016 03:20:43: # Action "train"                                                             #
+08/16/2016 03:20:43: #                                                                            #
+08/16/2016 03:20:43: ##############################################################################
+
+08/16/2016 03:20:43: CNTKCommandTrainBegin: speechTrain
+NDLBuilder Using GPU 0
+reading script file C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.scp ... 948 entries
+total 132 state names in state list C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/state.list
+htkmlfreader: reading MLF file C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.mlf ... total 948 entries
+...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
+label set 0: 129 classes
+minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
+useParallelTrain option is not enabled. ParallelTrain config will be ignored.
+08/16/2016 03:20:43: Creating virgin network.
+Node 'LSTMoutput1.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput1.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput1.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput1.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput1.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
+Node 'LSTMoutput2.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput2.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput2.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput2.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput2.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
+Node 'LSTMoutput3.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput3.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput3.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput3.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
+Node 'LSTMoutput3.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
+Node 'b' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
+Node 'LSTMoutput1.wx' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput1.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput1.Wh' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput1.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=3, range=0.050000*1.000000, onCPU=false).
+Microsoft::MSR::CNTK::GPUMatrix<ElemType>::SetUniformRandomValue (GPU): creating curand object with seed 3, sizeof(ElemType)==4
+Node 'LSTMoutput1.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=4, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput1.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=5, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput1.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=6, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput2.wx' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput2.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput2.Wh' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput2.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=9, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput2.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=10, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput2.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=11, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput2.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=12, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput3.wx' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput3.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
+Node 'LSTMoutput3.Wh' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
+Node 'LSTMoutput3.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=15, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput3.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=16, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput3.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=17, range=0.050000*1.000000, onCPU=false).
+Node 'LSTMoutput3.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=18, range=0.050000*1.000000, onCPU=false).
+Node 'W' (LearnableParameter operation): Initializating Parameter[132 x 0] as uniform later when dimensions are fully known.
+Node 'b' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
+
+Post-processing network...
+
+6 roots:
+	ce = CrossEntropyWithSoftmax()
+	err = ErrorPrediction()
+	featNorm.xMean = Mean()
+	featNorm.xStdDev = InvStdDev()
+	logPrior.prior = Mean()
+	scaledLogLikelihood = Minus()
+
+Loop[0] --> Loop_LSTMoutput1.output -> 24 nodes
+
+	LSTMoutput1.dh	LSTMoutput1.whh	LSTMoutput1.wxxpbpwhh
+	LSTMoutput1.G4	LSTMoutput1.G3	LSTMoutput1.dc
+	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft
+	LSTMoutput1.bft	LSTMoutput1.G1	LSTMoutput1.Wcidc
+	LSTMoutput1.unnamed163	LSTMoutput1.it	LSTMoutput1.G2
+	LSTMoutput1.unnamed164	LSTMoutput1.bit	LSTMoutput1.ct
+	LSTMoutput1.Wcoct	LSTMoutput1.unnamed166	LSTMoutput1.ot
+	LSTMoutput1.unnamed167	LSTMoutput1.mt	LSTMoutput1.output
+
+Loop[1] --> Loop_LSTMoutput2.output -> 24 nodes
+
+	LSTMoutput2.dh	LSTMoutput2.whh	LSTMoutput2.wxxpbpwhh
+	LSTMoutput2.G4	LSTMoutput2.G3	LSTMoutput2.dc
+	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed175	LSTMoutput2.ft
+	LSTMoutput2.bft	LSTMoutput2.G1	LSTMoutput2.Wcidc
+	LSTMoutput2.unnamed173	LSTMoutput2.it	LSTMoutput2.G2
+	LSTMoutput2.unnamed174	LSTMoutput2.bit	LSTMoutput2.ct
+	LSTMoutput2.Wcoct	LSTMoutput2.unnamed176	LSTMoutput2.ot
+	LSTMoutput2.unnamed177	LSTMoutput2.mt	LSTMoutput2.output
+
+Loop[2] --> Loop_LSTMoutput3.output -> 24 nodes
+
+	LSTMoutput3.dh	LSTMoutput3.whh	LSTMoutput3.wxxpbpwhh
+	LSTMoutput3.G4	LSTMoutput3.G3	LSTMoutput3.dc
+	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed185	LSTMoutput3.ft
+	LSTMoutput3.bft	LSTMoutput3.G1	LSTMoutput3.Wcidc
+	LSTMoutput3.unnamed183	LSTMoutput3.it	LSTMoutput3.G2
+	LSTMoutput3.unnamed184	LSTMoutput3.bit	LSTMoutput3.ct
+	LSTMoutput3.Wcoct	LSTMoutput3.unnamed186	LSTMoutput3.ot
+	LSTMoutput3.unnamed187	LSTMoutput3.mt	LSTMoutput3.output
+
+Validating network. 113 nodes to process in pass 1.
+
+Validating --> labels = InputValue() :  -> [132 x *]
+Validating --> W = LearnableParameter() :  -> [132 x 0]
+Validating --> LSTMoutput3.Wmr = LearnableParameter() :  -> [512 x 1024]
+Validating --> LSTMoutput3.wx = LearnableParameter() :  -> [4096 x 0]
+Validating --> LSTMoutput2.Wmr = LearnableParameter() :  -> [512 x 1024]
+Validating --> LSTMoutput2.wx = LearnableParameter() :  -> [4096 x 0]
+Validating --> LSTMoutput1.Wmr = LearnableParameter() :  -> [512 x 1024]
+Validating --> LSTMoutput1.wx = LearnableParameter() :  -> [4096 x 0]
+Validating --> features = InputValue() :  -> [363 x *]
+Validating --> featNorm.xMean = Mean (features) : [363 x *] -> [363]
+Validating --> featNorm.xStdDev = InvStdDev (features) : [363 x *] -> [363]
+Validating --> featNorm.xNorm = PerDimMeanVarNormalization (features, featNorm.xMean, featNorm.xStdDev) : [363 x *], [363], [363] -> [363 x *]
+Node 'LSTMoutput1.wx' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 363].
+Node 'LSTMoutput1.wx' (LearnableParameter operation): Initializing Parameter[4096 x 363] <- uniform(seed=1, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput1.wxx = Times (LSTMoutput1.wx, featNorm.xNorm) : [4096 x 363], [363 x *] -> [4096 x *]
+Validating --> LSTMoutput1.b = LearnableParameter() :  -> [4096 x 1]
+Validating --> LSTMoutput1.wxxpb = Plus (LSTMoutput1.wxx, LSTMoutput1.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
+Validating --> LSTMoutput1.Wh = LearnableParameter() :  -> [4096 x 0]
+Validating --> LSTMoutput1.Wco = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput1.Wcf = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput1.Wci = LearnableParameter() :  -> [1024]
+Node 'LSTMoutput1.Wh' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512].
+Node 'LSTMoutput1.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=2, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput1.whh = Times (LSTMoutput1.Wh, LSTMoutput1.dh) : [4096 x 512], [512] -> [4096]
+Validating --> LSTMoutput1.wxxpbpwhh = Plus (LSTMoutput1.wxxpb, LSTMoutput1.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
+Validating --> LSTMoutput1.G4 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.G3 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.Wcfdc = DiagTimes (LSTMoutput1.Wcf, LSTMoutput1.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput1.unnamed165 = Plus (LSTMoutput1.G3, LSTMoutput1.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.ft = Sigmoid (LSTMoutput1.unnamed165) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.bft = ElementTimes (LSTMoutput1.ft, LSTMoutput1.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.G1 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.Wcidc = DiagTimes (LSTMoutput1.Wci, LSTMoutput1.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput1.unnamed163 = Plus (LSTMoutput1.G1, LSTMoutput1.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.it = Sigmoid (LSTMoutput1.unnamed163) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.G2 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.unnamed164 = Tanh (LSTMoutput1.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.bit = ElementTimes (LSTMoutput1.it, LSTMoutput1.unnamed164) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.ct = Plus (LSTMoutput1.bft, LSTMoutput1.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.Wcoct = DiagTimes (LSTMoutput1.Wco, LSTMoutput1.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.unnamed166 = Plus (LSTMoutput1.G4, LSTMoutput1.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.ot = Sigmoid (LSTMoutput1.unnamed166) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.unnamed167 = Tanh (LSTMoutput1.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.mt = ElementTimes (LSTMoutput1.ot, LSTMoutput1.unnamed167) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.output = Times (LSTMoutput1.Wmr, LSTMoutput1.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
+Node 'LSTMoutput2.wx' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512 x 1].
+Node 'LSTMoutput2.wx' (LearnableParameter operation): Initializing Parameter[4096 x 512 x 1] <- uniform(seed=7, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput2.wxx = Times (LSTMoutput2.wx, LSTMoutput1.output) : [4096 x 512 x 1], [512 x 1 x *] -> [4096 x *]
+Validating --> LSTMoutput2.b = LearnableParameter() :  -> [4096 x 1]
+Validating --> LSTMoutput2.wxxpb = Plus (LSTMoutput2.wxx, LSTMoutput2.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
+Validating --> LSTMoutput2.Wh = LearnableParameter() :  -> [4096 x 0]
+Validating --> LSTMoutput2.Wco = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput2.Wcf = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput2.Wci = LearnableParameter() :  -> [1024]
+Node 'LSTMoutput2.Wh' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512].
+Node 'LSTMoutput2.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=8, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput2.whh = Times (LSTMoutput2.Wh, LSTMoutput2.dh) : [4096 x 512], [512] -> [4096]
+Validating --> LSTMoutput2.wxxpbpwhh = Plus (LSTMoutput2.wxxpb, LSTMoutput2.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
+Validating --> LSTMoutput2.G4 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.G3 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.Wcfdc = DiagTimes (LSTMoutput2.Wcf, LSTMoutput2.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput2.unnamed175 = Plus (LSTMoutput2.G3, LSTMoutput2.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.ft = Sigmoid (LSTMoutput2.unnamed175) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.bft = ElementTimes (LSTMoutput2.ft, LSTMoutput2.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.G1 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.Wcidc = DiagTimes (LSTMoutput2.Wci, LSTMoutput2.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput2.unnamed173 = Plus (LSTMoutput2.G1, LSTMoutput2.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.it = Sigmoid (LSTMoutput2.unnamed173) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.G2 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.unnamed174 = Tanh (LSTMoutput2.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.bit = ElementTimes (LSTMoutput2.it, LSTMoutput2.unnamed174) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.ct = Plus (LSTMoutput2.bft, LSTMoutput2.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.Wcoct = DiagTimes (LSTMoutput2.Wco, LSTMoutput2.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.unnamed176 = Plus (LSTMoutput2.G4, LSTMoutput2.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.ot = Sigmoid (LSTMoutput2.unnamed176) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.unnamed177 = Tanh (LSTMoutput2.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.mt = ElementTimes (LSTMoutput2.ot, LSTMoutput2.unnamed177) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.output = Times (LSTMoutput2.Wmr, LSTMoutput2.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
+Node 'LSTMoutput3.wx' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512 x 1].
+Node 'LSTMoutput3.wx' (LearnableParameter operation): Initializing Parameter[4096 x 512 x 1] <- uniform(seed=13, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput3.wxx = Times (LSTMoutput3.wx, LSTMoutput2.output) : [4096 x 512 x 1], [512 x 1 x *] -> [4096 x *]
+Validating --> LSTMoutput3.b = LearnableParameter() :  -> [4096 x 1]
+Validating --> LSTMoutput3.wxxpb = Plus (LSTMoutput3.wxx, LSTMoutput3.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
+Validating --> LSTMoutput3.Wh = LearnableParameter() :  -> [4096 x 0]
+Validating --> LSTMoutput3.Wco = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput3.Wcf = LearnableParameter() :  -> [1024]
+Validating --> LSTMoutput3.Wci = LearnableParameter() :  -> [1024]
+Node 'LSTMoutput3.Wh' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512].
+Node 'LSTMoutput3.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=14, range=0.050000*1.000000, onCPU=false).
+Validating --> LSTMoutput3.whh = Times (LSTMoutput3.Wh, LSTMoutput3.dh) : [4096 x 512], [512] -> [4096]
+Validating --> LSTMoutput3.wxxpbpwhh = Plus (LSTMoutput3.wxxpb, LSTMoutput3.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
+Validating --> LSTMoutput3.G4 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.G3 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.Wcfdc = DiagTimes (LSTMoutput3.Wcf, LSTMoutput3.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput3.unnamed185 = Plus (LSTMoutput3.G3, LSTMoutput3.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.ft = Sigmoid (LSTMoutput3.unnamed185) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.bft = ElementTimes (LSTMoutput3.ft, LSTMoutput3.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.G1 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.Wcidc = DiagTimes (LSTMoutput3.Wci, LSTMoutput3.dc) : [1024], [1024] -> [1024]
+Validating --> LSTMoutput3.unnamed183 = Plus (LSTMoutput3.G1, LSTMoutput3.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.it = Sigmoid (LSTMoutput3.unnamed183) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.G2 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.unnamed184 = Tanh (LSTMoutput3.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.bit = ElementTimes (LSTMoutput3.it, LSTMoutput3.unnamed184) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.ct = Plus (LSTMoutput3.bft, LSTMoutput3.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.Wcoct = DiagTimes (LSTMoutput3.Wco, LSTMoutput3.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.unnamed186 = Plus (LSTMoutput3.G4, LSTMoutput3.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.ot = Sigmoid (LSTMoutput3.unnamed186) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.unnamed187 = Tanh (LSTMoutput3.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.mt = ElementTimes (LSTMoutput3.ot, LSTMoutput3.unnamed187) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.output = Times (LSTMoutput3.Wmr, LSTMoutput3.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
+Node 'W' (LearnableParameter operation) operation: Tensor shape was inferred as [132 x 512 x 1].
+Node 'W' (LearnableParameter operation): Initializing Parameter[132 x 512 x 1] <- uniform(seed=19, range=0.050000*1.000000, onCPU=false).
+Validating --> unnamed193 = Times (W, LSTMoutput3.output) : [132 x 512 x 1], [512 x 1 x *] -> [132 x *]
+Validating --> b = LearnableParameter() :  -> [132 x 1]
+Validating --> LSTMoutputW = Plus (unnamed193, b) : [132 x *], [132 x 1] -> [132 x 1 x *]
+Validating --> ce = CrossEntropyWithSoftmax (labels, LSTMoutputW) : [132 x *], [132 x 1 x *] -> [1]
+Validating --> err = ErrorPrediction (labels, LSTMoutputW) : [132 x *], [132 x 1 x *] -> [1]
+Validating --> logPrior.prior = Mean (labels) : [132 x *] -> [132]
+Validating --> logPrior.logPrior = Log (logPrior.prior) : [132] -> [132]
+Validating --> scaledLogLikelihood = Minus (LSTMoutputW, logPrior.logPrior) : [132 x 1 x *], [132] -> [132 x 1 x *]
+
+Validating network. 88 nodes to process in pass 2.
+
+Validating --> LSTMoutput1.dh = PastValue (LSTMoutput1.output) : [512 x 1 x *] -> [512 x 1 x *]
+Validating --> LSTMoutput1.whh = Times (LSTMoutput1.Wh, LSTMoutput1.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
+Validating --> LSTMoutput1.dc = PastValue (LSTMoutput1.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.Wcfdc = DiagTimes (LSTMoutput1.Wcf, LSTMoutput1.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput1.Wcidc = DiagTimes (LSTMoutput1.Wci, LSTMoutput1.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.dh = PastValue (LSTMoutput2.output) : [512 x 1 x *] -> [512 x 1 x *]
+Validating --> LSTMoutput2.whh = Times (LSTMoutput2.Wh, LSTMoutput2.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
+Validating --> LSTMoutput2.dc = PastValue (LSTMoutput2.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.Wcfdc = DiagTimes (LSTMoutput2.Wcf, LSTMoutput2.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput2.Wcidc = DiagTimes (LSTMoutput2.Wci, LSTMoutput2.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.dh = PastValue (LSTMoutput3.output) : [512 x 1 x *] -> [512 x 1 x *]
+Validating --> LSTMoutput3.whh = Times (LSTMoutput3.Wh, LSTMoutput3.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
+Validating --> LSTMoutput3.dc = PastValue (LSTMoutput3.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.Wcfdc = DiagTimes (LSTMoutput3.Wcf, LSTMoutput3.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+Validating --> LSTMoutput3.Wcidc = DiagTimes (LSTMoutput3.Wci, LSTMoutput3.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
+
+Validating network. 15 nodes to process in pass 3.
+
+
+Validating network, final pass.
+
+
+
+29 out of 113 nodes do not share the minibatch layout with the input data.
+
+Post-processing network complete.
+
+08/16/2016 03:20:44: Created model with 113 nodes on GPU 0.
+
+08/16/2016 03:20:44: Training criterion node(s):
+08/16/2016 03:20:44: 	ce = CrossEntropyWithSoftmax
+
+08/16/2016 03:20:44: Evaluation criterion node(s):
+08/16/2016 03:20:44: 	err = ErrorPrediction
+
+
+Allocating matrices for forward and/or backward propagation.
+
+Memory Sharing: Out of 217 matrices, 125 are shared as 56, and 92 are not shared.
+
+	{ LSTMoutput2.mt : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.dh : [512 x 1 x *]
+	  LSTMoutput3.wxx : [4096 x *] (gradient) }
+	{ LSTMoutput2.Wco : [1024] (gradient)
+	  LSTMoutput3.dc : [1024 x 1 x *] }
+	{ LSTMoutput1.dh : [512 x 1 x *]
+	  LSTMoutput1.wxx : [4096 x *] (gradient) }
+	{ LSTMoutput1.mt : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.dh : [512 x 1 x *]
+	  LSTMoutput2.wxx : [4096 x *] (gradient) }
+	{ LSTMoutput1.Wco : [1024] (gradient)
+	  LSTMoutput2.dc : [1024 x 1 x *] }
+	{ LSTMoutput3.b : [4096 x 1] (gradient)
+	  LSTMoutput3.dh : [512 x 1 x *] (gradient) }
+	{ LSTMoutput1.bft : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.dc : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.G1 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.G2 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.Wcfdc : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.Wcidc : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.unnamed163 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.unnamed175 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.Wh : [4096 x 512] (gradient)
+	  LSTMoutput2.G2 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.Wcfdc : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput2.bft : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.dc : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput2.unnamed173 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.unnamed185 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput2.unnamed177 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.whh : [4096 x 1 x *] (gradient) }
+	{ LSTMoutput1.Wcidc : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.ft : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.Wcfdc : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.Wcidc : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.ft : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.wxxpb : [4096 x 1 x *] (gradient)
+	  LSTMoutput2.it : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput2.output : [512 x 1 x *] (gradient)
+	  LSTMoutput3.wxxpb : [4096 x 1 x *] (gradient) }
+	{ LSTMoutput2.wx : [4096 x 512 x 1] (gradient)
+	  LSTMoutput2.wxxpb : [4096 x 1 x *] }
+	{ LSTMoutput1.ct : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.wxxpbpwhh : [4096 x 1 x *] }
+	{ LSTMoutput1.unnamed164 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.Wcfdc : [1024 x 1 x *] }
+	{ LSTMoutput1.G1 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.ft : [1024 x 1 x *] }
+	{ LSTMoutput1.Wci : [1024] (gradient)
+	  LSTMoutput2.G1 : [1024 x 1 x *] }
+	{ LSTMoutput1.Wcf : [1024] (gradient)
+	  LSTMoutput2.it : [1024 x 1 x *] }
+	{ LSTMoutput1.ot : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.whh : [4096 x 1 x *] }
+	{ LSTMoutput1.G4 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.G4 : [1024 x 1 x *] }
+	{ LSTMoutput1.Wmr : [512 x 1024] (gradient)
+	  LSTMoutput2.wxx : [4096 x *] }
+	{ LSTMoutput1.G3 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.Wcidc : [1024 x 1 x *] }
+	{ LSTMoutput1.whh : [4096 x 1 x *] (gradient)
+	  LSTMoutput2.G2 : [1024 x 1 x *] }
+	{ LSTMoutput1.b : [4096 x 1] (gradient)
+	  LSTMoutput1.dh : [512 x 1 x *] (gradient)
+	  LSTMoutput2.unnamed174 : [1024 x 1 x *] }
+	{ LSTMoutput2.Wmr : [512 x 1024] (gradient)
+	  LSTMoutput3.wxx : [4096 x *] }
+	{ LSTMoutput1.wxxpbpwhh : [4096 x 1 x *] (gradient)
+	  LSTMoutput2.unnamed175 : [1024 x 1 x *] }
+	{ LSTMoutput1.wx : [4096 x 363] (gradient)
+	  LSTMoutput1.wxxpb : [4096 x 1 x *] }
+	{ LSTMoutput2.unnamed174 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.Wcfdc : [1024 x 1 x *] }
+	{ LSTMoutput2.G3 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.Wcidc : [1024 x 1 x *] }
+	{ LSTMoutput2.Wcoct : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.G4 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput2.b : [4096 x 1] (gradient)
+	  LSTMoutput2.dh : [512 x 1 x *] (gradient)
+	  LSTMoutput3.unnamed184 : [1024 x 1 x *] }
+	{ LSTMoutput3.output : [512 x 1 x *] (gradient)
+	  LSTMoutputW : [132 x 1 x *] (gradient) }
+	{ LSTMoutput1.ft : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.bft : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.output : [512 x 1 x *] (gradient)
+	  LSTMoutput2.wxxpb : [4096 x 1 x *] (gradient)
+	  LSTMoutput3.it : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.unnamed167 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.whh : [4096 x 1 x *] (gradient)
+	  LSTMoutput3.G2 : [1024 x 1 x *] }
+	{ LSTMoutput1.unnamed166 : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.wxxpbpwhh : [4096 x 1 x *] (gradient)
+	  LSTMoutput3.unnamed185 : [1024 x 1 x *] }
+	{ LSTMoutput2.unnamed176 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.wxxpbpwhh : [4096 x 1 x *] (gradient) }
+	{ LSTMoutput3.wx : [4096 x 512 x 1] (gradient)
+	  LSTMoutput3.wxxpb : [4096 x 1 x *] }
+	{ LSTMoutput2.ct : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.wxxpbpwhh : [4096 x 1 x *] }
+	{ LSTMoutput2.ot : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.whh : [4096 x 1 x *] }
+	{ LSTMoutput3.mt : [1024 x 1 x *] (gradient)
+	  unnamed193 : [132 x *] (gradient) }
+	{ LSTMoutput2.Wh : [4096 x 512] (gradient)
+	  LSTMoutput3.G2 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput1.bit : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.unnamed183 : [1024 x 1 x *] (gradient) }
+	{ LSTMoutput3.Wmr : [512 x 1024] (gradient)
+	  unnamed193 : [132 x *] }
+	{ LSTMoutput1.unnamed165 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.bft : [1024 x 1 x *] }
+	{ LSTMoutputW : [132 x 1 x *]
+	  W : [132 x 512 x 1] (gradient) }
+	{ LSTMoutput2.Wci : [1024] (gradient)
+	  LSTMoutput3.G1 : [1024 x 1 x *] }
+	{ LSTMoutput1.dc : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.G1 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.ft : [1024 x 1 x *] }
+	{ LSTMoutput2.Wcf : [1024] (gradient)
+	  LSTMoutput3.it : [1024 x 1 x *] }
+	{ LSTMoutput1.it : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.unnamed183 : [1024 x 1 x *] }
+	{ LSTMoutput1.Wcoct : [1024 x 1 x *] (gradient)
+	  LSTMoutput2.G4 : [1024 x 1 x *] (gradient)
+	  LSTMoutput3.G4 : [1024 x 1 x *] }
+
+
+08/16/2016 03:20:44: Training 13634692 parameters in 23 out of 23 parameter tensors and 104 nodes with gradient:
+
+08/16/2016 03:20:44: 	Node 'LSTMoutput1.Wcf' (LearnableParameter operation) : [1024]
+08/16/2016 03:20:44: 	Node 'LSTMoutput1.Wci' (LearnableParameter operation) : [1024]
+08/16/2016 03:20:44: 	Node 'LSTMoutput1.Wco' (LearnableParameter operation) : [1024]
+08/16/2016 03:20:44: 	Node 'LSTMoutput1.Wh' (LearnableParameter operation) : [4096 x 512]
+08/16/2016 03:20:44: 	Node 'LSTMoutput1.Wmr' (LearnableParameter operation) : [512 x 1024]
+08/16/2016 03:20:44: 	Node 'LSTMoutput1.b' (LearnableParameter operation) : [4096 x 1]
+08/16/2016 03:20:44: 	Node 'LSTMoutput1.wx' (LearnableParameter operation) : [4096 x 363]
+08/16/2016 03:20:44: 	Node 'LSTMoutput2.Wcf' (LearnableParameter operation) : [1024]
+08/16/2016 03:20:44: 	Node 'LSTMoutput2.Wci' (LearnableParameter operation) : [1024]
+08/16/2016 03:20:44: 	Node 'LSTMoutput2.Wco' (LearnableParameter operation) : [1024]
+08/16/2016 03:20:44: 	Node 'LSTMoutput2.Wh' (LearnableParameter operation) : [4096 x 512]
+08/16/2016 03:20:44: 	Node 'LSTMoutput2.Wmr' (LearnableParameter operation) : [512 x 1024]
+08/16/2016 03:20:44: 	Node 'LSTMoutput2.b' (LearnableParameter operation) : [4096 x 1]
+08/16/2016 03:20:44: 	Node 'LSTMoutput2.wx' (LearnableParameter operation) : [4096 x 512 x 1]
+08/16/2016 03:20:44: 	Node 'LSTMoutput3.Wcf' (LearnableParameter operation) : [1024]
+08/16/2016 03:20:44: 	Node 'LSTMoutput3.Wci' (LearnableParameter operation) : [1024]
+08/16/2016 03:20:44: 	Node 'LSTMoutput3.Wco' (LearnableParameter operation) : [1024]
+08/16/2016 03:20:44: 	Node 'LSTMoutput3.Wh' (LearnableParameter operation) : [4096 x 512]
+08/16/2016 03:20:44: 	Node 'LSTMoutput3.Wmr' (LearnableParameter operation) : [512 x 1024]
+08/16/2016 03:20:44: 	Node 'LSTMoutput3.b' (LearnableParameter operation) : [4096 x 1]
+08/16/2016 03:20:44: 	Node 'LSTMoutput3.wx' (LearnableParameter operation) : [4096 x 512 x 1]
+08/16/2016 03:20:44: 	Node 'W' (LearnableParameter operation) : [132 x 512 x 1]
+08/16/2016 03:20:44: 	Node 'b' (LearnableParameter operation) : [132 x 1]
+
+
+08/16/2016 03:20:44: Precomputing --> 3 PreCompute nodes found.
+
+08/16/2016 03:20:44: 	featNorm.xMean = Mean()
+08/16/2016 03:20:44: 	featNorm.xStdDev = InvStdDev()
+08/16/2016 03:20:44: 	logPrior.prior = Mean()
+minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
+requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
+
+08/16/2016 03:20:45: Precomputing --> Completed.
+
+
+08/16/2016 03:20:46: Starting Epoch 1: learning rate per sample = 0.001953  effective momentum = 0.000000  momentum as time constant = 0.0 samples
+minibatchiterator: epoch 0: frames [0..64] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
+
+08/16/2016 03:20:46: Starting minibatch loop.
+08/16/2016 03:20:47:  Epoch[ 1 of 1]-Minibatch[   1-  10, 250.00%]: ce = 4.87453079 * 160; err = 0.90625000 * 160; time = 1.1338s; samplesPerSecond = 141.1
+08/16/2016 03:20:48:  Epoch[ 1 of 1]-Minibatch[  11-  20, 500.00%]: ce = 4.84628143 * 160; err = 0.69375000 * 160; time = 1.0409s; samplesPerSecond = 153.7
+08/16/2016 03:20:49: Finished Epoch[ 1 of 1]: [Training] ce = 4.85708837 * 418; err = 0.80382775 * 418; totalSamplesSeen = 418; learningRatePerSample = 0.001953125; epochTime=2.90303s
+08/16/2016 03:20:50: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu/Models/cntkSpeechLSTM.dnn'
+08/16/2016 03:20:51: CNTKCommandTrainEnd: speechTrain
+
+08/16/2016 03:20:51: Action "train" complete.
+
+08/16/2016 03:20:51: __COMPLETED__
--- a/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/run-test
+++ b/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/run-test
@ -5,5 +5,5 @@
 ConfigDir=$TEST_DIR/../../../../../../Examples/Speech/AN4/Config

 # cntkrun <CNTK config file name> <additional CNTK args>
-cntkrun LSTM-NDL.cntk "speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=64]] parallelTrain=false" || exit $?
+cntkrun LSTM-NDL.cntk "speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=64]] speechTrain=[reader=[useMersenneTwisterRand=true]] parallelTrain=false" || exit $?

--- a/Tests/EndToEndTests/Examples/Speech/TIMIT/AdaptLearnRate/baseline.linux.txt
+++ b/Tests/EndToEndTests/Examples/Speech/TIMIT/AdaptLearnRate/baseline.linux.txt
--- a/Tests/EndToEndTests/Examples/Speech/TIMIT/AdaptLearnRate/run-test
+++ b/Tests/EndToEndTests/Examples/Speech/TIMIT/AdaptLearnRate/run-test
@ -6,4 +6,4 @@
 . $TEST_DIR/../run-timit-test-common

 # cntkrun <CNTK config file name> <additional CNTK arg>
-cntkrun TIMIT_AdaptLearnRate.cntk "$CntkArguments" || exit $?
+cntkrun TIMIT_AdaptLearnRate.cntk "$CntkArguments TIMIT_TrainAdaptLR=[reader=[useMersenneTwisterRand=true]] TIMIT_TrainAdaptLR=[cvReader=[useMersenneTwisterRand=true]]" || exit $?
--- a/Tests/EndToEndTests/Examples/Speech/TIMIT/CrossValidateSimpleNetwork/baseline.linux.txt
+++ b/Tests/EndToEndTests/Examples/Speech/TIMIT/CrossValidateSimpleNetwork/baseline.linux.txt
--- a/Tests/EndToEndTests/Examples/Speech/TIMIT/CrossValidateSimpleNetwork/run-test
+++ b/Tests/EndToEndTests/Examples/Speech/TIMIT/CrossValidateSimpleNetwork/run-test
@ -6,7 +6,7 @@
 . $TEST_DIR/../run-timit-test-common

 # Train:
-cntkrun TIMIT_TrainSimpleNetwork.cntk "$CntkArguments" || exit $?
+cntkrun TIMIT_TrainSimpleNetwork.cntk "$CntkArguments TIMIT_TrainSimple=[reader=[useMersenneTwisterRand=true]]" || exit $?

 # Validate:
-cntkrun TIMIT_CrossValidateSimpleNetwork.cntk "$CntkArguments" || exit $?
+cntkrun TIMIT_CrossValidateSimpleNetwork.cntk "$CntkArguments" || exit $?
--- a/Показать больше
+++ b/Показать больше