merged from master. Undid the ClassificationError baseline updates due to merge conflicts

This commit is contained in:
Frank Seide 2016-08-22 14:36:28 -07:00
Parent 1e68b3c289 8493f118da
Commit 5b969bac70
380 changed files: 292772 additions and 1787443 deletions

View file

@@ -34,48 +34,29 @@
   <UseZip Condition="Exists('$(ZLIB_PATH)')">true</UseZip>
 </PropertyGroup>
-<Choose>
-  <When Condition="Exists('$(ACML_PATH)')">
-    <PropertyGroup>
-      <MathLibrary>ACML</MathLibrary>
-      <MathLibraryName>ACML</MathLibraryName>
-      <MathIncludePath>$(ACML_PATH)\include</MathIncludePath>
-      <MathLibraryPath>$(ACML_PATH)\lib</MathLibraryPath>
-      <MathLinkLibrary>libacml_mp_dll.lib</MathLinkLibrary>
-      <MathDelayLoad>libacml_mp_dll.dll</MathDelayLoad>
-      <MathPostBuildCopyPattern>$(ACML_PATH)\lib\*.dll</MathPostBuildCopyPattern>
-      <UnitTestDlls>$(OutDir)libacml_mp_dll.dll;$(OutDir)libifcoremd.dll;$(OutDir)libifportmd.dll;$(OutDir)libiomp*.dll;$(OutDir)libmmd.dll;$(OutDir)svml_dispmd.dll;</UnitTestDlls>
-      <MathDefine>USE_ACML</MathDefine>
-    </PropertyGroup>
-  </When>
-  <!-- See https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#optional-mkl on how to configure to build CNTK with MKL -->
-  <When Condition="'$(CNTK_MKL)' == '1'">
-    <PropertyGroup>
-      <MathLibrary>MKL</MathLibrary>
-      <CNTKCustomMKLVersion>1</CNTKCustomMKLVersion>
-      <CNTKCustomMKLPath>$(CNTK_MKL_PATH)\$(CNTKCustomMKLVersion)</CNTKCustomMKLPath>
-      <MathIncludePath>$(CNTKCustomMKLPath)\include</MathIncludePath>
-      <MathDefine>USE_MKL</MathDefine>
-    </PropertyGroup>
-    <PropertyGroup Condition="'$(CNTK_MKL_SEQUENTIAL)' != '1'">
-      <MathLibraryName>CNTK custom MKL Parallel (Version: $(CNTKCustomMKLVersion))</MathLibraryName>
-      <MathLibraryPath>$(CNTKCustomMKLPath)\x64\parallel</MathLibraryPath>
-      <MathLinkLibrary>mkl_cntk_p.lib</MathLinkLibrary>
-      <MathDelayLoad>mkl_cntk_p.dll</MathDelayLoad>
-      <MathPostBuildCopyPattern>$(MathLibraryPath)\*.dll</MathPostBuildCopyPattern>
-      <UnitTestDlls>$(OutDir)mkl_cntk_p.dll;$(OutDir)libiomp5md.dll;</UnitTestDlls>
-    </PropertyGroup>
-    <PropertyGroup Condition="'$(CNTK_MKL_SEQUENTIAL)' == '1'">
-      <MathLibraryName>CNTK custom MKL Sequential (Version: $(CNTKCustomMKLVersion))</MathLibraryName>
-      <MathLibraryPath>$(CNTKCustomMKLPath)\x64\sequential</MathLibraryPath>
-      <MathLinkLibrary>mkl_cntk_s.lib</MathLinkLibrary>
-      <MathDelayLoad>mkl_cntk_s.dll</MathDelayLoad>
-      <MathPostBuildCopyPattern>$(MathLibraryPath)\*.dll</MathPostBuildCopyPattern>
-      <UnitTestDlls>$(OutDir)mkl_cntk_s.dll;</UnitTestDlls>
-    </PropertyGroup>
-  </When>
-</Choose>
+<PropertyGroup>
+  <MathLibrary>MKL</MathLibrary>
+  <CNTKCustomMKLVersion>1</CNTKCustomMKLVersion>
+  <CNTKCustomMKLPath>$(CNTK_MKL_PATH)\$(CNTKCustomMKLVersion)</CNTKCustomMKLPath>
+  <MathIncludePath>$(CNTKCustomMKLPath)\include</MathIncludePath>
+  <MathDefine>USE_MKL</MathDefine>
+</PropertyGroup>
+<PropertyGroup Condition="'$(CNTK_MKL_SEQUENTIAL)' != '1'">
+  <MathLibraryName>CNTK custom MKL Parallel (Version: $(CNTKCustomMKLVersion))</MathLibraryName>
+  <MathLibraryPath>$(CNTKCustomMKLPath)\x64\parallel</MathLibraryPath>
+  <MathLinkLibrary>mkl_cntk_p.lib</MathLinkLibrary>
+  <MathDelayLoad>mkl_cntk_p.dll</MathDelayLoad>
+  <MathPostBuildCopyPattern>$(MathLibraryPath)\*.dll</MathPostBuildCopyPattern>
+  <UnitTestDlls>$(OutDir)mkl_cntk_p.dll;$(OutDir)libiomp5md.dll;</UnitTestDlls>
+</PropertyGroup>
+<PropertyGroup Condition="'$(CNTK_MKL_SEQUENTIAL)' == '1'">
+  <MathLibraryName>CNTK custom MKL Sequential (Version: $(CNTKCustomMKLVersion))</MathLibraryName>
+  <MathLibraryPath>$(CNTKCustomMKLPath)\x64\sequential</MathLibraryPath>
+  <MathLinkLibrary>mkl_cntk_s.lib</MathLinkLibrary>
+  <MathDelayLoad>mkl_cntk_s.dll</MathDelayLoad>
+  <MathPostBuildCopyPattern>$(MathLibraryPath)\*.dll</MathPostBuildCopyPattern>
+  <UnitTestDlls>$(OutDir)mkl_cntk_s.dll;</UnitTestDlls>
+</PropertyGroup>
 <PropertyGroup Condition="$(UseZip)">
   <ZipInclude>$(ZLIB_PATH)\include;$(ZLIB_PATH)\lib\libzip\include;</ZipInclude>

View file

@@ -1150,6 +1150,9 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalClient", "Examples\E
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BrainScriptTests", "Tests\UnitTests\BrainScriptTests\BrainScriptTests.vcxproj", "{9F999212-AFC5-4EAC-AA78-F7247D46C456}"
 	ProjectSection(ProjectDependencies) = postProject
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
+		{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513}
+		{EAD17188-072C-4726-B840-A769C36DAD1B} = {EAD17188-072C-4726-B840-A769C36DAD1B}
 		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
 	EndProjectSection
 EndProject

View file

@@ -9,8 +9,6 @@
 # that provides
 #   BUILDTYPE= One of release or debug
 #     defaults to release
-#   ACML_PATH= path to ACML library installation
-#     only needed if MATHLIB=acml
 #   MKL_PATH= path to CNTK custom MKL installation
 #     only needed if MATHLIB=mkl
 #   CNTK_CUSTOM_MKL_VERSION=2
@@ -21,8 +19,8 @@
 #     defaults to /usr/include/nvidia/gdk
 #   GDK_NVML_LIB_PATH= path to CUDA GDK (stub) library path, so $(GDK_NVML_LIB_PATH)/libnvidia-ml.so exists
 #     defaults to /usr/src/gdk/nvml/lib
-#   MATHLIB= One of acml or mkl
-#     defaults to acml
+#   MATHLIB= mkl
+#     defaults to mkl
 #   CUDA_PATH= Path to CUDA
 #     If not specified, GPU will not be enabled
 #   CUB_PATH= path to NVIDIA CUB installation, so $(CUB_PATH)/cub/cub.cuh exists
@@ -60,8 +58,8 @@ BUILDTYPE=release
 endif

 ifndef MATHLIB
-  $(info DEFAULTING MATHLIB=acml)
-  MATHLIB = acml
+  $(info DEFAULTING MATHLIB=mkl)
+  MATHLIB = mkl
 endif

 #### Configure based on options above
@@ -137,13 +135,6 @@ else
   COMMON_FLAGS +=-DCPUONLY
 endif

-ifeq ("$(MATHLIB)","acml")
-  INCLUDEPATH += $(ACML_PATH)/include
-  LIBPATH += $(ACML_PATH)/lib
-  LIBS += -lacml_mp -liomp5 -lm -lpthread
-  COMMON_FLAGS += -DUSE_ACML
-endif
-
 ifeq ("$(MATHLIB)","mkl")
   INCLUDEPATH += $(MKL_PATH)/$(CNTK_CUSTOM_MKL_VERSION)/include
   LIBS += -lm
@@ -418,6 +409,10 @@ CNTKLIBRARY_TESTS_SRC =\
 	Tests/UnitTests/V2LibraryTests/TensorTests.cpp \
 	Tests/UnitTests/V2LibraryTests/TrainerTests.cpp \
 	Tests/UnitTests/V2LibraryTests/CifarResNet.cpp \
+	Tests/UnitTests/V2LibraryTests/SerializationTests.cpp \
+	Tests/UnitTests/V2LibraryTests/LearnerTests.cpp \
+	Tests/UnitTests/V2LibraryTests/FunctionTests.cpp \
+	Tests/UnitTests/V2LibraryTests/SequenceClassification.cpp \

 CNTKLIBRARY_TESTS:=$(BINDIR)/v2librarytests
 CNTKLIBRARY_TESTS_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(CNTKLIBRARY_TESTS_SRC)))
@@ -933,22 +928,24 @@ UNITTEST_BRAINSCRIPT_SRC = \
 	$(SOURCEDIR)/CNTK/BrainScript/BrainScriptEvaluator.cpp \
 	$(SOURCEDIR)/CNTK/BrainScript/BrainScriptParser.cpp \
 	$(SOURCEDIR)/../Tests/UnitTests/BrainScriptTests/ParserTests.cpp \
+	$(SOURCEDIR)/../Tests/UnitTests/BrainScriptTests/ComputationNetworkTests.cpp \
 	$(SOURCEDIR)/../Tests/UnitTests/BrainScriptTests/stdafx.cpp

-UNITTEST_BRAINSCRIPT_SRC+=$(COMMON_SRC)
+UNITTEST_BRAINSCRIPT_SRC += $(COMPUTATION_NETWORK_LIB_SRC)
+UNITTEST_BRAINSCRIPT_SRC += $(SEQUENCE_TRAINING_LIB_SRC)

-UNITTEST_BRAINSCRIPT_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_BRAINSCRIPT_SRC))
+UNITTEST_BRAINSCRIPT_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_BRAINSCRIPT_SRC)))

 UNITTEST_BRAINSCRIPT := $(BINDIR)/brainscripttests

 ALL += $(UNITTEST_BRAINSCRIPT)
 SRC += $(UNITTEST_BRAINSCRIPT_SRC)

-$(UNITTEST_BRAINSCRIPT): $(UNITTEST_BRAINSCRIPT_OBJ)
+$(UNITTEST_BRAINSCRIPT): $(UNITTEST_BRAINSCRIPT_OBJ) | $(CNTKMATH_LIB)
 	@echo $(SEPARATOR)
 	@mkdir -p $(dir $@)
 	@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
-	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -ldl
+	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -ldl -l$(CNTKMATH)

 unittests: $(UNITTEST_EVAL) $(UNITTEST_READER) $(UNITTEST_NETWORK) $(UNITTEST_MATH) $(UNITTEST_BRAINSCRIPT)

View file

@@ -72,18 +72,6 @@ void DoTrain(const ConfigRecordType& config)
     bool makeMode = config(L"makeMode", true);
     DEVICEID_TYPE deviceId = DeviceFromConfig(config);

-    // determine the network-creation function
-    // We have several ways to create that network.
-    function<ComputationNetworkPtr(DEVICEID_TYPE)> createNetworkFn;
-    createNetworkFn = GetNetworkFactory<ConfigRecordType, ElemType>(config);
-
-    auto dataReader = CreateObject<DataReader>(config, L"reader");
-
-    shared_ptr<DataReader> cvDataReader;
-    if (config.Exists(L"cvReader"))
-        cvDataReader = CreateObject<DataReader>(config, L"cvReader");
-
     shared_ptr<SGD<ElemType>> optimizer;
     if (config.Exists(L"optimizer"))
     {
@@ -95,8 +83,39 @@ void DoTrain(const ConfigRecordType& config)
         optimizer = make_shared<SGD<ElemType>>(configSGD);
     }

+    // determine which epoch to start with, including recovering a checkpoint if any and 'makeMode' enabled
+    int startEpoch = optimizer->DetermineStartEpoch(makeMode);
+    if (startEpoch == optimizer->GetMaxEpochs())
+    {
+        LOGPRINTF(stderr, "No further training is necessary.\n");
+        return;
+    }
+
+    wstring modelFileName = optimizer->GetModelNameForEpoch(int(startEpoch) - 1);
+    bool loadNetworkFromCheckpoint = startEpoch >= 0;
+    fprintf(stderr, "\n");
+    if (loadNetworkFromCheckpoint)
+        LOGPRINTF(stderr, "Starting from checkpoint. Loading network from '%ls'.\n", modelFileName.c_str());
+    else
+        LOGPRINTF(stderr, "Creating virgin network.\n");
+
+    // determine the network-creation function
+    // We have several ways to create that network.
+    function<ComputationNetworkPtr(DEVICEID_TYPE)> createNetworkFn;
+    createNetworkFn = GetNetworkFactory<ConfigRecordType, ElemType>(config);
+
+    // create or load from checkpoint
+    shared_ptr<ComputationNetwork> net = !loadNetworkFromCheckpoint ? createNetworkFn(deviceId) : ComputationNetwork::CreateFromFile<ElemType>(deviceId, modelFileName);
+
+    auto dataReader = CreateObject<DataReader>(config, L"reader");
+
+    shared_ptr<DataReader> cvDataReader;
+    if (config.Exists(L"cvReader"))
+        cvDataReader = CreateObject<DataReader>(config, L"cvReader");
+
     optimizer->InitMPI(MPIWrapper::GetInstance());
-    optimizer->Train(createNetworkFn, deviceId, dataReader.get(), cvDataReader.get(), makeMode);
+    optimizer->Train(net, deviceId, dataReader.get(), cvDataReader.get(), startEpoch, loadNetworkFromCheckpoint);
 }

 namespace Microsoft { namespace MSR { namespace ScriptableObjects {
@@ -189,9 +208,8 @@ void DoDumpNodes(const ConfigParameters& config)
     if (!printValues && !printMetadata)
         InvalidArgument("printValues and printMetadata: Since both are set to false, there will be nothing to dump");

-    ComputationNetwork net(CPUDEVICE); // always use CPU
-    net.Load<ElemType>(modelPath); // TODO: we have a function now to combine this and the previous line
-    net.DumpNodeInfoToFile(nodeName, printValues, printMetadata, outputFile, nodeNameRegexStr);
+    ComputationNetworkPtr net = ComputationNetwork::CreateFromFile<ElemType>(CPUDEVICE, modelPath);
+    net->DumpNodeInfoToFile(nodeName, printValues, printMetadata, outputFile, nodeNameRegexStr);
 }

 template void DoDumpNodes<float>(const ConfigParameters& config);

View file

@@ -10,7 +10,7 @@
 #include "stdafx.h"
 #ifdef _WIN32
 #include <crtdbg.h>
 #endif

 #include "Basics.h"
 #include "Actions.h"

View file

@@ -53,8 +53,6 @@ if "%p_CNTK_MKL%" == "1" (
     ) else (
         echo #define _MATHLIB_ "mkl">> buildinfo.h$$
     )
-) else (
-    echo #define _MATHLIB_ "acml">> buildinfo.h$$
 )
 echo #define _BUILDER_ "%USERNAME%" >> buildinfo.h$$

View file

@@ -7,6 +7,12 @@

 #pragma once

+#ifdef SWIG
+#define final
+#define explicit
+#define static_assert(condition, message)
+#endif
+
 #include "CNTKLibraryInternals.h"

 #include <memory>
@@ -14,10 +20,12 @@
 #include <array>
 #include <stdarg.h>
 #include <assert.h>
+#include <map>
 #include <unordered_map>
 #include <unordered_set>
 #include <string>
 #include <sstream>
+#include <iosfwd>
 #include<algorithm>

 namespace CNTK
@@ -236,7 +244,7 @@ namespace CNTK
         }

         ///
-        /// Creates and returns a new shape contructed by appending the dimensions of the specified 'shape' to 'this' shape's dimensions.
+        /// Creates and returns a new shape constructed by appending the dimensions of the specified 'shape' to 'this' shape's dimensions.
         ///
         NDShape AppendShape(const NDShape& shape) const
         {
@@ -665,35 +673,52 @@ namespace CNTK
     ///
     /// Denotes an Axis of a Variable and is used for specifying the axes parameters of certain Functions such as reductions.
-    /// Besides the static axes corresponding to each of the axes of the Variable's shape, Input and Output Variables
-    /// also have one or more dynamic axes (corresponding to the sequence dimensions) and one implicit batch axis denoting the axes
-    /// along which multiple sequences are batched in the Values corresponding to the variable when performing computations.
+    /// Besides the static axes corresponding to each of the axes of the Variable's shape, Variables of kind 'Input' and any
+    /// 'Output' Variables dependent on an 'Input' Variable also have 2 additional dynamic axes whose dimensions are known only
+    /// when the Variable is bound to actual data during compute (viz. sequence axis and batch axis denoting the axis along which
+    /// multiple sequences are batched)
     ///
     class Axis final
     {
+        CNTK_API static const std::wstring s_staticAxisNamePrefix;
+
     public:
         ///
         /// Construct an Axis object denoting a static axis with the specified index.
         ///
-        Axis(size_t staticAxisIdx)
+        explicit Axis(size_t staticAxisIdx)
             : m_staticAxisIdx(staticAxisIdx)
         {
-            const wchar_t* staticAxisNamePrefix = L"staticAxis_";
-            m_name = staticAxisNamePrefix + std::to_wstring(staticAxisIdx);
+            m_name = s_staticAxisNamePrefix + std::to_wstring(staticAxisIdx);
         }

         ///
         /// Construct a dynamic axis with the specified name.
         ///
-        Axis(const std::wstring& name)
+        explicit Axis(const std::wstring& name)
             : m_staticAxisIdx(SIZE_MAX), m_name(name)
         {
+            if (m_name.length() > s_staticAxisNamePrefix.length())
+            {
+                auto prefix = m_name.substr(0, s_staticAxisNamePrefix.length());
+                auto suffix = m_name.substr(s_staticAxisNamePrefix.length(), m_name.length() - s_staticAxisNamePrefix.length());
+                if (prefix == s_staticAxisNamePrefix)
+                {
+                    if (suffix == L"0")
+                        *this = Axis(0);
+                    else
+                    {
+                        auto suffixVal = std::stoul(suffix);
+                        if (suffixVal != 0)
+                            *this = Axis(suffixVal);
+                    }
+                }
+            }
         }

         ///
         /// Returns a boolean indicating if 'this' Axis corresponds to a static axis
         ///
-        bool IsStaticAxis() const { return m_staticAxisIdx == SIZE_MAX; }
+        bool IsStaticAxis() const { return m_staticAxisIdx != SIZE_MAX; }

         ///
         /// Returns the axis index if 'this' Axis is a static axis. Throws an exception otherwise.
@@ -714,12 +739,7 @@ namespace CNTK
         ///
         /// Static Axis object representing the batch axis.
         ///
-        CNTK_API static const Axis& BatchAxis();
-
-        ///
-        /// Special Axis object denoting all the axes of the Value object in whose context it is used.
-        ///
-        CNTK_API static const Axis& AllAxes();
+        CNTK_API static const Axis& DefaultBatchAxis();

         ///
         /// Name of 'this' axis
@@ -753,7 +773,20 @@ namespace CNTK
     {
         return !(first == second);
     }
+}
+
+namespace std {
+    template <> struct hash<CNTK::Axis>
+    {
+        size_t operator()(const CNTK::Axis& x) const
+        {
+            return std::hash<std::wstring>()(x.Name());
+        }
+    };
+}

+namespace CNTK
+{
     ///
     /// Enumeration type denoting the kind of a symbolic Variable object
     ///
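To illustrate the reworked Axis semantics above, a minimal sketch: a static axis is index-based, a dynamic axis is name-based, and a name carrying the reserved staticAxis_ prefix is canonicalized back into the corresponding static axis by the name-based constructor. Everything below is declared in this header except StaticAxisIndex(), whose name is assumed from the doc comment.

// Minimal sketch of the new Axis semantics (StaticAxisIndex() assumed from the doc comment).
#include "CNTKLibrary.h"
#include <cassert>

void AxisExample()
{
    CNTK::Axis staticAxis(1);              // static axis: index-based, named "staticAxis_1"
    CNTK::Axis seqAxis(L"mySequenceAxis"); // dynamic axis: name-based, index == SIZE_MAX

    assert(staticAxis.IsStaticAxis());     // note the fixed comparison: m_staticAxisIdx != SIZE_MAX
    assert(!seqAxis.IsStaticAxis());

    // The reserved "staticAxis_" prefix round-trips through the name-based constructor.
    CNTK::Axis roundTrip(L"staticAxis_1");
    assert(roundTrip.IsStaticAxis() && roundTrip.StaticAxisIndex() == 1);
}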
@@ -780,47 +813,76 @@ namespace CNTK
         template <typename T>
         friend struct std::hash;

+        CNTK_API static const std::vector<Axis> s_defaultInputVariableDynamicAxes;
+
     public:
         ///
         /// Create an 'Input' Variable.
         ///
-        Variable(const NDShape& shape, CNTK::DataType dataType)
-            : Variable(shape, dataType, L"")
+        Variable(const NDShape& shape, CNTK::DataType dataType, const std::vector<Axis>& dynamicAxes = s_defaultInputVariableDynamicAxes)
+            : Variable(shape, dataType, L"", dynamicAxes)
         {}

         ///
         /// Create an 'Input' Variable.
         ///
-        Variable(const NDShape& shape, CNTK::DataType dataType, const wchar_t* name)
-            : Variable(shape, dataType, std::wstring(name))
+        Variable(const NDShape& shape, CNTK::DataType dataType, const wchar_t* name, const std::vector<Axis>& dynamicAxes = s_defaultInputVariableDynamicAxes)
+            : Variable(shape, dataType, std::wstring(name), dynamicAxes)
         {}

         ///
         /// Create an 'Input' Variable.
         ///
-        Variable(const NDShape& shape, CNTK::DataType dataType, const std::wstring& name)
-            : Variable(shape, VariableKind::Input, dataType, nullptr, nullptr, false, { Axis::DefaultDynamicAxis() }, false, name)
+        Variable(const NDShape& shape, CNTK::DataType dataType, const std::wstring& name, const std::vector<Axis>& dynamicAxes = s_defaultInputVariableDynamicAxes)
+            : Variable(shape, false, dataType, name, dynamicAxes)
         {}

         ///
         /// Create an 'Input' Variable denoting sparse data.
         ///
-        Variable(const NDShape& shape, bool isSparse, CNTK::DataType dataType, const std::wstring& name = L"")
-            : Variable(shape, VariableKind::Input, dataType, nullptr, nullptr, false, { Axis::DefaultDynamicAxis() }, isSparse, name)
+        Variable(const NDShape& shape, bool isSparse, CNTK::DataType dataType, const std::vector<Axis>& dynamicAxes = s_defaultInputVariableDynamicAxes)
+            : Variable(shape, isSparse, dataType, false, L"", dynamicAxes)
+        {}
+
+        ///
+        /// Create an 'Input' Variable denoting sparse data.
+        ///
+        Variable(const NDShape& shape, bool isSparse, CNTK::DataType dataType, const wchar_t* name, const std::vector<Axis>& dynamicAxes = s_defaultInputVariableDynamicAxes)
+            : Variable(shape, isSparse, dataType, std::wstring(name), dynamicAxes)
+        {}
+
+        ///
+        /// Create an 'Input' Variable denoting sparse data.
+        ///
+        Variable(const NDShape& shape, bool isSparse, CNTK::DataType dataType, const std::wstring& name, const std::vector<Axis>& dynamicAxes = s_defaultInputVariableDynamicAxes)
+            : Variable(shape, isSparse, dataType, false, name, dynamicAxes)
         {}

         ///
         /// Create an 'Input' Variable and specify if gradients are to be computed for this input
         ///
-        Variable(const NDShape& shape, CNTK::DataType dataType, bool needsGradient, const std::wstring& name = L"")
-            : Variable(shape, VariableKind::Input, dataType, nullptr, nullptr, needsGradient, { Axis::DefaultDynamicAxis() }, false, name)
+        Variable(const NDShape& shape, CNTK::DataType dataType, bool needsGradient, const wchar_t* name, const std::vector<Axis>& dynamicAxes = s_defaultInputVariableDynamicAxes)
+            : Variable(shape, dataType, needsGradient, std::wstring(name), dynamicAxes)
+        {}
+
+        ///
+        /// Create an 'Input' Variable and specify if gradients are to be computed for this input
+        ///
+        Variable(const NDShape& shape, CNTK::DataType dataType, bool needsGradient, const std::wstring& name, const std::vector<Axis>& dynamicAxes = s_defaultInputVariableDynamicAxes)
+            : Variable(shape, false, dataType, needsGradient, name, dynamicAxes)
         {}

         ///
         /// Create an 'Input' Variable denoting sparse data and specify if gradients are to be computed for this input
         ///
-        Variable(const NDShape& shape, bool isSparse, CNTK::DataType dataType, bool needsGradient, const std::wstring& name = L"")
-            : Variable(shape, VariableKind::Input, dataType, nullptr, nullptr, needsGradient, { Axis::DefaultDynamicAxis() }, isSparse, name)
+        Variable(const NDShape& shape, bool isSparse, CNTK::DataType dataType, bool needsGradient, const std::vector<Axis>& dynamicAxes = s_defaultInputVariableDynamicAxes)
+            : Variable(shape, isSparse, dataType, needsGradient, L"", dynamicAxes)
+        {}
+
+        ///
+        /// Create an 'Input' Variable denoting sparse data and specify if gradients are to be computed for this input
+        ///
+        Variable(const NDShape& shape, bool isSparse, CNTK::DataType dataType, bool needsGradient, const std::wstring& name, const std::vector<Axis>& dynamicAxes = s_defaultInputVariableDynamicAxes)
+            : Variable(shape, VariableKind::Input, dataType, nullptr, nullptr, needsGradient, dynamicAxes, isSparse, name)
         {}

         ///
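A hedged sketch of how the widened 'Input' Variable constructors above compose (NDShape construction from a brace list is assumed; everything else is declared in this header):

// Sketch: creating 'Input' Variables with explicit dynamic axes (assumes this header).
#include "CNTKLibrary.h"
#include <vector>

void InputVariableExample()
{
    using namespace CNTK;

    // Default dynamic axes (s_defaultInputVariableDynamicAxes).
    Variable features({ 28, 28 }, DataType::Float, L"features");

    // Explicit dynamic axes: a named sequence axis plus the default batch axis.
    std::vector<Axis> axes = { Axis(L"mySequenceAxis"), Axis::DefaultBatchAxis() };
    Variable labels({ 10 }, /*isSparse =*/ true, DataType::Float, L"labels", axes);

    // Passing the same dynamic axis twice would trip the new uniqueness check
    // in VariableFields and raise InvalidArgument.
}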
@@ -860,7 +922,7 @@ namespace CNTK
         ///
         /// Returns a boolean value indicating if 'this' variable denotes sparse data
         ///
-        bool IsSparse() const { return (m_dataFields->m_isSparse); }
+        bool IsSparse() const { return m_dataFields->m_isSparse; }

         ///
         /// Returns a boolean value indicating if 'this' variable is an Input
@@ -941,6 +1003,14 @@ namespace CNTK
             VariableFields(const NDShape& shape, VariableKind varType, CNTK::DataType type, Function* ownerFunction, const NDArrayViewPtr& value, bool needsGradient, const std::vector<Axis>& dynamicAxes, bool isSparse, const std::wstring& name)
                 : m_shape(shape), m_varKind(varType), m_dataType(type), m_ownerFunction(ownerFunction), m_value(value), m_needsGradient(needsGradient), m_dynamicAxes(dynamicAxes), m_isSparse(isSparse), m_name(name)
             {
+                // Validate that each of the dynamic axes are unique
+                std::unordered_set<Axis> uniqueDynamicAxis;
+                for (auto& currentDynamicAxis : dynamicAxes)
+                {
+                    auto retVal = uniqueDynamicAxis.insert(currentDynamicAxis);
+                    if (!retVal.second)
+                        InvalidArgument("Dynamic axis named %S is specified more than once for Variable object", currentDynamicAxis.Name().c_str());
+                }
             }

         private:
@@ -1079,7 +1149,7 @@ namespace CNTK
         /// Contruct a Placeholder with the specified NDShape
         ///
         explicit Placeholder(const NDShape& shape, const std::wstring& name = L"")
-            : Variable(shape, VariableKind::Placeholder, DataType::Unknown, nullptr, false, {Axis::DefaultDynamicAxis()}, name)
+            : Variable(shape, VariableKind::Placeholder, DataType::Unknown, nullptr, false, { Axis::DefaultDynamicAxis(), Axis::DefaultBatchAxis() }, name)
         {}

         ///
@@ -1097,13 +1167,15 @@ namespace CNTK
 }

 namespace std {
-    template <> struct hash<CNTK::Axis>
+    template <> struct hash<CNTK::NDShape>
     {
-        size_t operator()(const CNTK::Axis& x) const
+        size_t operator()(const CNTK::NDShape& x) const
         {
-            return std::hash<std::wstring>()(x.Name());
+            return std::hash<std::wstring>()(x.AsString());
         }
     };

     template <> struct hash<CNTK::Variable>
     {
@@ -1441,6 +1513,21 @@ namespace CNTK
     ///
     CNTK_API FunctionPtr Softmax(const Variable& operand, const std::wstring& name = L"");

+    ///
+    /// Create an instance of the CNTK built-in hardmax operation on specified tensor input operand
+    ///
+    CNTK_API FunctionPtr Hardmax(const Variable& operand, const std::wstring& name = L"");
+
+    ///
+    /// Create an instance of the CNTK built-in transpose dimensions operation on specified tensor input operand
+    ///
+    CNTK_API FunctionPtr TransposeAxes(const Variable& operand, const Axis& axis1, const Axis& axis2, const std::wstring& name = L"");
+
+    ///
+    /// Create an instance of the slice operation on specified tensor input operand
+    ///
+    CNTK_API FunctionPtr Slice(const Variable& operand, const Axis& axis, int beginIndex, int endIndex, const std::wstring& name = L"");
+
     ///
     /// Create an instance of the CNTK built-in elementwise tensor addition operation with the specified input operands.
     ///
@@ -1497,6 +1584,13 @@ namespace CNTK
     ///
     CNTK_API FunctionPtr Times(const Variable& leftOperand, const Variable& rightOperand, size_t numOutputAxes = 1, const std::wstring& name = L"");

+    ///
+    /// Create an instance of the CNTK built-in matrix multiplication operation with the transpose of the left input operand
+    /// and the specified right operand. Only accepts left operands of ranks 1 or 2.
+    /// TODO: Specify the constraints on the shapes of the operands.
+    ///
+    CNTK_API FunctionPtr TransposeTimes(const Variable& leftOperand, const Variable& rightOperand, size_t numOutputAxes = 1, const std::wstring& name = L"");
+
     ///
     /// Create an instance of the CNTK built-in operation to compute squared-error for specified input operands.
     ///
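A small sketch of the newly exported TransposeTimes (per the doc comment above, it multiplies the transpose of the left operand with the right operand; the helper name and shapes are illustrative):

// Sketch: TransposeTimes as declared above; numOutputAxes = 1 yields a result
// with a single static axis, without materializing the transpose of W.
#include "CNTKLibrary.h"

CNTK::FunctionPtr LinearProjection(const CNTK::Variable& W, const CNTK::Variable& x)
{
    return CNTK::TransposeTimes(W, x, /*numOutputAxes =*/ 1, L"projection");
}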
@@ -1518,7 +1612,6 @@ namespace CNTK
     ///
     CNTK_API FunctionPtr PastValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name = L"");

-    //CNTK_API FunctionPtr PastValue(const Variable& initialState, const Variable& operand, Axis axis, const std::wstring& name = L"");
     ///
     /// Create an instance of the CNTK built-in operation for getting the future value along the lone dynamic axis of the specified operand.
@@ -1532,6 +1625,16 @@ namespace CNTK
     ///
     CNTK_API FunctionPtr ReduceSum(const Variable& operand, const std::wstring& name = L"");

+    ///
+    /// Create an instance of the CNTK built-in sum reduction operation on specified tensor input operand along the specified axis
+    ///
+    CNTK_API FunctionPtr ReduceSum(const Variable& operand, const Axis& axis, const std::wstring& name = L"");
+
+    ///
+    /// Create an instance of the CNTK built-in LogSum reduction operation on specified tensor input operand along the specified axis
+    ///
+    CNTK_API FunctionPtr ReduceLogSum(const Variable& operand, const Axis& axis, const std::wstring& name = L"");
+
     ///
     /// Per dimension mean-variance normalization of the specified input operand.
     ///
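The axis-aware ops added above compose naturally; a minimal sketch (all functions are declared in this header, the composition itself is illustrative):

// Sketch: composing the newly added axis-aware ops.
#include "CNTKLibrary.h"

CNTK::FunctionPtr ReductionExample(const CNTK::Variable& z)
{
    using namespace CNTK;
    auto firstTwo   = Slice(z, Axis(0), 0, 2);            // elements [0, 2) along static axis 0
    auto transposed = TransposeAxes(firstTwo, Axis(0), Axis(1));
    return ReduceLogSum(transposed, Axis(0));             // log-sum reduction along axis 0
}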
@@ -1630,6 +1733,7 @@ namespace CNTK
             NDShape,
             Vector,
             Dictionary,
+            NDArrayView,
         };

         static const char* TypeName(Type type)
@@ -1654,6 +1758,8 @@ namespace CNTK
                 return "Vector";
             case Type::Dictionary:
                 return "Dictionary";
+            case Type::NDArrayView:
+                return "NDArrayView";
             default:
                 LogicError("Unknown DictionaryValue::Type");
             }
@@ -1687,13 +1793,21 @@ namespace CNTK
         DictionaryValue(const wchar_t* value)
             : DictionaryValue(std::wstring(value))
         {}

+        // Due to SWIG we had to flatten this template for vector<DictionaryValue>
+        DictionaryValue(const std::vector<CNTK::DictionaryValue>& value) : m_valueType(GetValueType<std::vector<CNTK::DictionaryValue>>())
+        {
+            AllocateDataPtr(value);
+        }
+
         template <typename T>
         DictionaryValue(const T& value) : m_valueType(GetValueType<T>())
         {
-            static_assert(std::is_same<T, NDShape>::value ||
+            static_assert((std::is_same<T, NDShape>::value ||
                           std::is_same<T, std::wstring>::value ||
                           std::is_same<T, std::vector<DictionaryValue>>::value ||
-                          std::is_same<T, Dictionary>::value,
+                          std::is_same<T, Dictionary>::value ||
+                          std::is_same<T, NDArrayView>::value),
                           "Unsupported ValueType");

             AllocateDataPtr(value);
@@ -1706,6 +1820,12 @@ namespace CNTK
             *this = other;
         }

+        DictionaryValue(DictionaryValue&& other) : m_valueType(Type::Bool)
+        {
+            // The m_valueType must have been set to a non-ptr type to prevent an attempt to interpret
+            // the underlying uninitialized value as a ptr and free it.
+            *this = std::move(other);
+        }
+
         DictionaryValue& operator=(const DictionaryValue& other)
         {
             if (this != &other)
@@ -1723,11 +1843,33 @@ namespace CNTK
                     AllocateDataPtr(other.GetValue<std::vector<DictionaryValue>>());
                 else if (other.m_valueType == Type::Dictionary)
                     AllocateDataPtr(other.GetValue<Dictionary>());
+                else if (other.m_valueType == Type::NDArrayView)
+                    AllocateDataPtr(other.GetValue<NDArrayView>());
             }

             return *this;
         }

+        DictionaryValue& operator=(DictionaryValue&& other)
+        {
+            FreeDataPtr();
+
+            m_valueType = other.m_valueType;
+            m_data = other.m_data;
+
+            if (other.m_valueType == Type::String ||
+                other.m_valueType == Type::NDShape ||
+                other.m_valueType == Type::Vector ||
+                other.m_valueType == Type::Dictionary ||
+                other.m_valueType == Type::NDArrayView)
+            {
+                other.m_data.m_ptr = nullptr;
+            }
+
+            other.m_valueType = Type::None;
+
+            return *this;
+        }
+
         ~DictionaryValue()
         {
             FreeDataPtr();
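A hedged sketch of the move semantics just added: the move constructor deliberately seeds m_valueType with a non-pointer type (Bool) so that the FreeDataPtr() call inside operator=(DictionaryValue&&) cannot misread uninitialized storage as a pointer, and the moved-from value ends up as Type::None with a nulled payload pointer.

// Sketch: DictionaryValue move semantics as declared above (assumes this header).
#include "CNTKLibrary.h"
#include <utility>
#include <vector>

void DictionaryValueMoveExample()
{
    using namespace CNTK;

    DictionaryValue v(std::vector<DictionaryValue>{ DictionaryValue(L"checkpoint") });
    DictionaryValue moved(std::move(v)); // payload pointer stolen; v becomes Type::None

    const auto& vec = moved.GetValue<std::vector<DictionaryValue>>();
    // vec.size() == 1 and holds the wide string "checkpoint"; v no longer owns it
}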
@@ -1764,7 +1906,8 @@ namespace CNTK
         template <typename T, typename std::enable_if<std::is_same<T, NDShape>::value ||
                                                       std::is_same<T, std::wstring>::value ||
                                                       std::is_same<T, std::vector<DictionaryValue>>::value ||
-                                                      std::is_same<T, Dictionary>::value>::type* = nullptr>
+                                                      std::is_same<T, Dictionary>::value ||
+                                                      std::is_same<T, NDArrayView>::value>::type* = nullptr>
         const T& GetValue() const
         {
             VerifyType<T>();
@@ -1781,21 +1924,25 @@ namespace CNTK
             return m_valueType;
         }

-        friend CNTK_API Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, DictionaryValue& us);
-        friend CNTK_API Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const DictionaryValue& us);
+        CNTK_API bool operator==(const DictionaryValue& other) const;
+        CNTK_API bool operator!=(const DictionaryValue& other) const;
+
+        friend CNTK_API std::istream& operator>>(std::istream& stream, DictionaryValue& us);
+        friend CNTK_API std::ostream& operator<<(std::ostream& stream, const DictionaryValue& us);

     private:
         template <typename T>
         static Type GetValueType()
         {
-            static_assert(std::is_same<T, bool>::value ||
+            static_assert((std::is_same<T, bool>::value ||
                           std::is_same<T, size_t>::value ||
                           std::is_same<T, float>::value ||
                           std::is_same<T, double>::value ||
                           std::is_same<T, std::wstring>::value ||
                           std::is_same<T, NDShape>::value ||
                           std::is_same<T, std::vector<DictionaryValue>>::value ||
-                          std::is_same<T, Dictionary>::value,
+                          std::is_same<T, Dictionary>::value ||
+                          std::is_same<T, NDArrayView>::value),
                           "Unsupported ValueType");
@@ -1806,6 +1953,7 @@ namespace CNTK
             if (std::is_same<T, NDShape>::value) return Type::NDShape;
             if (std::is_same<T, std::vector<DictionaryValue>>::value) return Type::Vector;
             if (std::is_same<T, Dictionary>::value) return Type::Dictionary;
+            if (std::is_same<T, NDArrayView>::value) return Type::NDArrayView;
         }

         template <typename T>
@@ -1831,6 +1979,8 @@ namespace CNTK
                 FreePtrAsType<std::vector<DictionaryValue>>();
             else if (m_valueType == Type::Dictionary)
                 FreePtrAsType<Dictionary>();
+            else if (m_valueType == Type::NDArrayView)
+                FreePtrAsType<NDArrayView>();
         }

         Type m_valueType;
@@ -1884,9 +2034,11 @@ namespace CNTK
             return Contains(key.c_str());
         }

-        friend CNTK_API Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, Dictionary& us);
-        friend CNTK_API Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const Dictionary& us);
+        CNTK_API bool operator==(const Dictionary& other) const;
+        CNTK_API bool operator!=(const Dictionary& other) const;
+
+        friend CNTK_API std::istream& operator>>(std::istream& stream, Dictionary& us);
+        friend CNTK_API std::ostream& operator<<(std::ostream& stream, const Dictionary& us);

     private:
         std::shared_ptr<std::unordered_map<std::wstring, DictionaryValue>> m_dictionaryData;
@@ -1924,6 +2076,9 @@ namespace CNTK
         ///
         CNTK_API virtual void RestoreFromCheckpoint(const Dictionary& /*checkpoint*/) {}

+        ///
+        /// Destruct this Learner.
+        ///
         virtual ~Learner() {}

     protected:
@@ -1935,37 +2090,127 @@ namespace CNTK
     };

+    ///
+    /// A collection of key-value pairs that represents training parameter schedule in
+    /// terms of the number of processed samples.
+    /// This class provides a number of convenience constructors to allow easy conversion
+    /// from a single value, a vector of values and a list of pairs to the training schedule.
+    ///
+    template <typename T>
+    class TrainingParameterSchedule
+    {
+    public:
+        ///
+        /// Create a schedule with a constant parameter value.
+        ///
+        TrainingParameterSchedule(T value)
+            : m_schedule({ std::make_pair(0, value) }), m_unit(1)
+        {}
+
+        ///
+        /// Create a schedule where the parameter changes its value every 'unit' samples:
+        /// schedule[0] is used for the first 'unit' samples, schedule[1] -- for the second,
+        /// and so on. The last value is then used repeatedly until the end of training.
+        ///
+        TrainingParameterSchedule(const std::vector<T>& schedule, size_t unit = 1)
+            : m_unit(unit)
+        {
+            // TODO: 0 will be used to mean "the entire sweep"
+            if (unit == 0)
+                RuntimeError("TrainingParameterSchedule::constructor : 'unit' cannot be 0.");
+
+            if (schedule.size() == 0)
+                RuntimeError("TrainingParameterSchedule::constructor : schedule is empty.");
+
+            size_t i = 1;
+            for (const auto& value : schedule)
+            {
+                m_schedule[m_unit * i++] = value;
+            }
+        }
+
+        ///
+        /// Create a schedule using the list of key-value pairs, where the key specifies
+        /// the number of 'units' the parameter should maintain the corresponding value.
+        /// The value from the last pair is used repeatedly until the end of training.
+        /// For example, {{1, 0.05}, {2, 0.1}, {1, 0.005}} and unit = 100, corresponds to
+        /// a schedule where the value of '0.05' is used for the first 100 samples, then
+        /// '0.1' is used for the second 200 samples, after which the value is switched
+        /// to '0.005'.
+        ///
+        TrainingParameterSchedule(const std::initializer_list<std::pair<const size_t, T>>& schedule, size_t unit = 1)
+            : m_unit(unit)
+        {
+            // TODO: 0 will be used to mean "the entire sweep"
+            if (unit == 0)
+                RuntimeError("TrainingParameterSchedule::constructor : 'unit' cannot be 0.");
+
+            if (schedule.size() == 0)
+                RuntimeError("TrainingParameterSchedule::constructor : schedule is empty.");
+
+            size_t i = 0;
+            for (const auto& it : schedule)
+            {
+                if (it.first == 0)
+                    RuntimeError("TrainingParameterSchedule::constructor : unit count cannot be 0.");
+
+                i += it.first;
+                m_schedule[m_unit * i] = it.second;
+            }
+        }
+
+        ///
+        /// Returns a value corresponding to the absolute sample count from the beginning of training.
+        ///
+        CNTK_API const T& operator[](size_t sampleCount) const;
+
+    private:
+        std::map<size_t, T> m_schedule;
+        size_t m_unit;
+    };
+
+    typedef TrainingParameterSchedule<double> LearningRatesPerSample;
+    typedef TrainingParameterSchedule<double> MomentumsPerSample;
+
     ///
     /// Create an instance of the CNTK built-in SGD learner.
     ///
-    /// TODO: add additional SGD parameters here (a collection of learning rate values)
-    CNTK_API LearnerPtr SGDLearner(const std::unordered_set<Parameter>& parameters, double learningRatePerSample);
+    CNTK_API LearnerPtr SGDLearner(const std::unordered_set<Parameter>& parameters,
+                                   const LearningRatesPerSample& learningRates);

     ///
     /// Create an instance of the CNTK built-in Momentum SGD learner.
     ///
-    /// TODO: add additional Momentum parameters here (a collection of momentum rate values)
-    CNTK_API LearnerPtr MomentumSGDLearner(const std::unordered_set<Parameter>& parameters);
+    CNTK_API LearnerPtr MomentumSGDLearner(const std::unordered_set<Parameter>& parameters,
+                                           const LearningRatesPerSample& learningRates,
+                                           const MomentumsPerSample& momentums);

     ///
     /// Create an instance of the CNTK built-in Nesterov's accelerated SGD learner.
     ///
-    CNTK_API LearnerPtr NesterovLearner(const std::unordered_set<Parameter>& parameters);
+    CNTK_API LearnerPtr NesterovLearner(const std::unordered_set<Parameter>& parameters,
+                                        const LearningRatesPerSample& learningRates,
+                                        const MomentumsPerSample& momentums);

     ///
     /// Create an instance of the CNTK built-in AdaGrad learner.
     ///
-    CNTK_API LearnerPtr AdaGradLearner(const std::unordered_set<Parameter>& parameters, bool needAveMultiplier = true);
+    CNTK_API LearnerPtr AdaGradLearner(const std::unordered_set<Parameter>& parameters,
+                                       const LearningRatesPerSample& learningRates,
+                                       bool needAveMultiplier = true);

     ///
     /// Create an instance of the CNTK built-in FSAdaGrad (improved AdaGrad) learner.
     ///
-    CNTK_API LearnerPtr FSAdaGradLearner(const std::unordered_set<Parameter>& parameters);
+    CNTK_API LearnerPtr FSAdaGradLearner(const std::unordered_set<Parameter>& parameters,
+                                         const LearningRatesPerSample& learningRates,
+                                         const MomentumsPerSample& momentums);

     ///
     /// Create an instance of the CNTK built-in RMSProp learner.
     ///
     CNTK_API LearnerPtr RMSPropLearner(const std::unordered_set<Parameter>& parameters,
+                                       const LearningRatesPerSample& learningRates,
                                        double gamma,
                                        double inc,
                                        double dec,
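The schedule semantics above follow directly from the doc comments; a minimal sketch (everything used here is declared in this header):

// Sketch: TrainingParameterSchedule as described in the doc comments above.
#include "CNTKLibrary.h"

void ScheduleExample()
{
    using namespace CNTK;

    // A constant learning rate for the whole run.
    LearningRatesPerSample constantRate(0.005);

    // With unit = 100: 0.05 for samples [0, 100), 0.1 for [100, 300),
    // then 0.005 repeated until the end of training.
    LearningRatesPerSample rates({ { 1, 0.05 }, { 2, 0.1 }, { 1, 0.005 } }, 100);

    double early = rates[50];    // 0.05
    double middle = rates[150];  // 0.1
    double late = rates[99999];  // 0.005 (last value repeats)
}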
@@ -1975,7 +2220,7 @@ namespace CNTK

     ///
     /// Trainer is the top-level abstraction responsible for the orchestration of the training of a model
-    /// using the specified learners and training data either explicilty supplied as Value objects or from
+    /// using the specified learners and training data either explicitly supplied as Value objects or from
     /// a MinibatchSource object.
     ///
     class Trainer
@@ -2063,7 +2308,7 @@ namespace CNTK
     };

     ///
-    /// Abstraction for generating minbatches of samples for training/evaluation.
+    /// Abstraction for generating minibatches of samples for training/evaluation.
     ///
     class MinibatchSource : public std::enable_shared_from_this<MinibatchSource>
     {
@@ -2079,10 +2324,14 @@ namespace CNTK
         /// #samples or both. In case the size is specified in terms of both #sequences and #samples, the smaller of the 2 is taken. The actual
         /// returned size of the minibatch is the min across all streams. Also the requested MB size fields in the maps are updated by the
         /// MinibatchSource to contain the actual #sequences and #samples in the returned minibatch for the corresponding stream.
-        /// The return value indciates if the MinibatchSource will return any further data in subsequent calls of this function.
+        /// The return value indicates if the MinibatchSource will return any further data in subsequent calls of this function.
         ///
-        virtual std::unordered_map<StreamInfo, MinibatchData> GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
-                                                                               const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice()) = 0;
+        virtual const std::unordered_map<StreamInfo, MinibatchData>& GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
+                                                                                      const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice()) = 0;
+
+        ///
+        /// Destruct this MinibatchSource.
+        ///
+        virtual ~MinibatchSource() {}

         // TODO: Methods to save and restore from checkpoints
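A hedged read-loop sketch against the revised GetNextMinibatch above. The per-stream limits pair is (#sequences, #samples) per the doc comment; treating an empty result map as end-of-data is an assumption made for illustration only.

// Sketch: draining a MinibatchSource (assumes this header; end-of-data
// convention is assumed, not specified by the interface).
#include "CNTKLibrary.h"
#include <unordered_map>
#include <utility>

void ReadAllMinibatches(CNTK::MinibatchSource& source, const CNTK::StreamInfo& features)
{
    using namespace CNTK;
    std::unordered_map<StreamInfo, std::pair<size_t, size_t>> limits =
        { { features, { 0, 1024 } } };   // no sequence cap, up to 1024 samples

    for (;;)
    {
        const auto& mb = source.GetNextMinibatch(limits);
        if (mb.empty())
            break;                       // assumed end-of-data convention
        // ... feed mb.at(features) to a Trainer here
    }
}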

View file

@@ -7,6 +7,12 @@

 #pragma once

+#ifdef SWIG
+#define final
+#define explicit
+#define static_assert(condition, message)
+#endif
+
 #ifdef _WIN32
 #ifdef CNTKV2LIBRARYDLL
 #define CNTK_API __declspec(dllexport)
@@ -47,8 +53,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     template <typename ElementType>
     class ComputationNode;

-    class File;
-
 }}}

 // TODO: The following should be reconciled with the equivalent code in the CNTK implementation
@@ -133,7 +137,7 @@ namespace CNTK
 #define NOT_IMPLEMENTED \
 { \
     fprintf(stderr, "Inside File: %s Line: %d Function: %s -> Feature Not Implemented.\n", __FILE__, __LINE__, __FUNCTION__); \
-    LogicError("Inside File: %s Line: %d Function: %s -> Feature Not Implemented.\n", __FILE__, __LINE__, __FUNCTION__); \
+    CNTK::LogicError("Inside File: %s Line: %d Function: %s -> Feature Not Implemented.\n", __FILE__, __LINE__, __FUNCTION__); \
 }
 #endif
 }
@@ -144,6 +148,7 @@ namespace CNTK
     class CompositeFunction;
     class Function;
     class Variable;
+    class Axis;

     // Similar to make_shared except that it associates a custom deleter with the shared_ptr to ensure
     // that objects are deleted on the same side of the library DLL where they are allocated
@@ -174,4 +179,15 @@ namespace CNTK
     class MinibatchSource;
     typedef std::shared_ptr<MinibatchSource> MinibatchSourcePtr;

+    namespace Internal
+    {
+        CNTK_API FunctionPtr PackedIndex(const Variable& operand, const Variable& index, const std::wstring& name = L"");
+        CNTK_API FunctionPtr GatherPacked(const Variable& operand, const Variable& packedIndex, const std::wstring& name = L"");
+        CNTK_API FunctionPtr IsWithin(const Variable& operand, int offset, const std::wstring& name = L"");
+        CNTK_API FunctionPtr Where(const Variable& condition, const std::vector<Axis>& newDynamicAxes, const std::wstring& name = L"");
+        CNTK_API FunctionPtr Gather(const Variable& operand, const Variable& condition, const std::vector<Axis>& newDynamicAxes, const std::wstring& name = L"");
+        CNTK_API FunctionPtr Slice(const Variable& operand, const Axis& axis, int beginIndex, int endIndex, const std::wstring& name = L"");
+        CNTK_API FunctionPtr ReduceElements(const Variable& operand, const std::wstring& reductionOpName, const Axis& axis, const std::wstring& name = L"");
+    }
 }

View file

@@ -15,6 +15,7 @@
 #include "RecurrentNodes.h"
 #include "EvaluationNodes.h"
 #include "TrainingNodes.h"
+#include "ReshapingNodes.h"

 using namespace Microsoft::MSR::CNTK;
@@ -32,6 +33,7 @@ namespace CNTK
             Variable var;
             NDShape varShape = AsNDShape(node->GetSampleLayout());
+
             // The CNTK sample layouts may have trailing axes with dimension size of 1 which are automatically
             // added when converting from NDShape to CNTK internal TensorShapes and are not present in the original
             // shapes specified by the user. These should be truncated.
@@ -57,11 +59,10 @@ namespace CNTK
             if (node->HasMBLayout())
             {
                 // TODO: Currently only default dynamic axis is supported
-                const std::wstring defaultCNTKDynamicAxisName = L"";
-                if (inputNode->GetRequestedDynamicAxis() != defaultCNTKDynamicAxisName)
-                    LogicError("Found dynamic axis named '%S' while currently only default dynamic axis named '%S' is supported!", node->GetMBLayout()->GetAxisName(), defaultCNTKDynamicAxisName.c_str());
+                auto inputNodeInternalDynamicAxisName = inputNode->GetRequestedDynamicAxis();
+                std::vector<Axis> inputVarDynamicAxes = DynamicAxesFromInternalDynamicAxisName(inputNodeInternalDynamicAxisName);

-                var = Variable(varShape, isSparse, AsDataType<ElementType>(), node->GetLearningRateMultiplier() != 0, node->GetName());
+                var = Variable(varShape, isSparse, AsDataType<ElementType>(), node->GetLearningRateMultiplier() != 0, node->GetName(), inputVarDynamicAxes);
             }
             else
             {
@@ -121,6 +122,40 @@ namespace CNTK
                 opType = PrimitiveOpType::Reciprocal;
             else if (node->OperationName() == OperationNameOf(SoftmaxNode))
                 opType = PrimitiveOpType::Softmax;
+            else if (node->OperationName() == OperationNameOf(HardmaxNode))
+                opType = PrimitiveOpType::Hardmax;
+            else if (node->OperationName() == OperationNameOf(TransposeDimensionsNode))
+            {
+                auto transposeDimensionsNode = node->As<TransposeDimensionsNode<ElementType>>();
+                primitiveFunctionConfigParameters[L"axis1"] = (size_t)transposeDimensionsNode->Axis1();
+                primitiveFunctionConfigParameters[L"axis2"] = (size_t)transposeDimensionsNode->Axis2();
+
+                opType = PrimitiveOpType::TransposeAxes;
+            }
+            else if (node->OperationName() == OperationNameOf(WhereNode))
+            {
+                auto whereNode = node->As<WhereNode<ElementType>>();
+                auto internalDynamicAxisName = whereNode->DynamicAxisName();
+                std::vector<Axis> dynamicAxes = DynamicAxesFromInternalDynamicAxisName(internalDynamicAxisName);
+                std::vector<std::wstring> dynamicAxesNames;
+                for (auto axis : dynamicAxes)
+                    dynamicAxesNames.push_back(axis.Name());
+
+                primitiveFunctionConfigParameters[L"newDynamicAxes"] = AsDictionaryValueVector(dynamicAxesNames);
+
+                opType = PrimitiveOpType::Where;
+            }
+            else if (node->OperationName() == OperationNameOf(SliceNode))
+            {
+                auto sliceNode = node->As<SliceNode<ElementType>>();
+                primitiveFunctionConfigParameters[L"axis"] = Axis(sliceNode->Axis() - 1).Name();
+                primitiveFunctionConfigParameters[L"beginIndex"] = sliceNode->BeginIndex();
+                primitiveFunctionConfigParameters[L"endIndex"] = sliceNode->EndIndex();
+
+                opType = PrimitiveOpType::Slice;
+            }
+            else if (node->OperationName() == OperationNameOf(SumElementsNode))
+                opType = PrimitiveOpType::SumAll;
             else if (node->OperationName() == OperationNameOf(PlusNode))
                 opType = PrimitiveOpType::Plus;
             else if (node->OperationName() == OperationNameOf(MinusNode))
@@ -139,11 +174,23 @@ namespace CNTK
                 opType = PrimitiveOpType::Greater;
             else if (node->OperationName() == OperationNameOf(GreaterEqualNode))
                 opType = PrimitiveOpType::GreaterEqual;
+            else if (node->OperationName() == OperationNameOf(PackedIndexNode))
+                opType = PrimitiveOpType::PackedIndex;
+            else if (node->OperationName() == OperationNameOf(GatherPackedNode))
+            {
+                std::swap(inputVars[0], inputVars[1]);
+                opType = PrimitiveOpType::GatherPacked;
+            }
             else if (node->OperationName() == OperationNameOf(TimesNode))
             {
-                primitiveFunctionConfigParameters[L"numOutputAxes"] = DictionaryValue((size_t)node->As<TimesNode<ElementType>>()->OutputRank());
+                primitiveFunctionConfigParameters[L"numOutputAxes"] = (size_t)node->As<TimesNode<ElementType>>()->OutputRank();
                 opType = PrimitiveOpType::Times;
             }
+            else if (node->OperationName() == OperationNameOf(TransposeTimesNode))
+            {
+                primitiveFunctionConfigParameters[L"numOutputAxes"] = (size_t)node->As<TransposeTimesNode<ElementType>>()->OutputRank();
+                opType = PrimitiveOpType::TransposeTimes;
+            }
             else if (node->OperationName() == OperationNameOf(PastValueNode))
             {
                 if (inputVars.size() == 1)
@@ -151,7 +198,7 @@ namespace CNTK
                     auto initialStateVar = Constant({}, node->As<PastValueNode<ElementType>>()->InitialActivationValue(), AsDeviceDescriptor(node->GetDeviceId()));
                     inputVars.insert(inputVars.begin(), initialStateVar);
                 }
-                primitiveFunctionConfigParameters[L"stepSize"] = DictionaryValue((size_t)node->As<PastValueNode<ElementType>>()->TimeStep());
+                primitiveFunctionConfigParameters[L"stepSize"] = (size_t)node->As<PastValueNode<ElementType>>()->TimeStep();
                 opType = PrimitiveOpType::PastValue;
             }
             else if (node->OperationName() == OperationNameOf(FutureValueNode))
@@ -161,7 +208,7 @@ namespace CNTK
                     auto initialStateVar = Constant({}, node->As<FutureValueNode<ElementType>>()->InitialActivationValue(), AsDeviceDescriptor(node->GetDeviceId()));
                     inputVars.insert(inputVars.begin(), initialStateVar);
                 }
-                primitiveFunctionConfigParameters[L"stepSize"] = DictionaryValue((size_t)node->As<FutureValueNode<ElementType>>()->TimeStep());
+                primitiveFunctionConfigParameters[L"stepSize"] = (size_t)node->As<FutureValueNode<ElementType>>()->TimeStep();
                 opType = PrimitiveOpType::FutureValue;
             }
             else if (node->OperationName() == OperationNameOf(SquareErrorNode))
@@ -176,8 +223,14 @@ namespace CNTK
                 std::swap(inputVars[0], inputVars[1]);
                 opType = PrimitiveOpType::ClassificationError;
             }
-            else if (node->OperationName() == OperationNameOf(SumElementsNode))
-                opType = PrimitiveOpType::ReduceSum;
+            else if (node->OperationName() == OperationNameOf(ReduceElementsNode))
+            {
+                auto reduceElementsNode = node->As<ReduceElementsNode<ElementType>>();
+                primitiveFunctionConfigParameters[L"CNTKInternalReductionAxisIndex"] = (size_t)reduceElementsNode->ReductionAxis();
+                primitiveFunctionConfigParameters[L"ReductionOpName"] = reduceElementsNode->ReductionOpName();
+
+                opType = PrimitiveOpType::ReduceElements;
+            }
             else if (node->OperationName() == OperationNameOf(ConvolutionNode))
             {
                 auto convolutionNode = node->As<ConvolutionNode<ElementType>>();


@ -14,21 +14,17 @@ namespace CNTK
return GPUDevice(0); return GPUDevice(0);
} }
/*static*/ const std::wstring Axis::s_staticAxisNamePrefix = L"staticAxis_";
/*static*/ const Axis& Axis::DefaultDynamicAxis() /*static*/ const Axis& Axis::DefaultDynamicAxis()
{ {
static Axis s_defaultDynamicAxis(L"defaultDynamicAxis"); static Axis s_defaultDynamicAxis(L"defaultDynamicAxis");
return s_defaultDynamicAxis; return s_defaultDynamicAxis;
} }
/*static*/ const Axis& Axis::BatchAxis() /*static*/ const Axis& Axis::DefaultBatchAxis()
{ {
static Axis s_batchAxis(L"batchAxis"); static Axis s_batchAxis(L"defaultBatchAxis");
return s_batchAxis; return s_batchAxis;
} }
/*static*/ const Axis& Axis::AllAxes()
{
static Axis s_allAxes(L"allAxes");
return s_allAxes;
}
} }


@ -10,6 +10,10 @@
#include "Utils.h" #include "Utils.h"
#include "ComputationNode.h" #include "ComputationNode.h"
#include "ReshapingNodes.h" #include "ReshapingNodes.h"
#include "EvaluationNodes.h"
#include "TrainingNodes.h"
#include "LinearAlgebraNodes.h"
#include "InputAndParamNodes.h"
using namespace Microsoft::MSR::CNTK; using namespace Microsoft::MSR::CNTK;
@ -72,6 +76,17 @@ namespace CNTK
} }
} }
/*static*/ const std::wstring PrimitiveFunction::InternalSumReductionOpName = L"Sum";
/*static*/ const std::wstring PrimitiveFunction::InternalLogSumReductionOpName = L"LogSum";
/*static*/ const std::wstring PrimitiveFunction::InternalMeanReductionOpName = L"Mean";
/*static*/ const std::wstring PrimitiveFunction::InternalMaxReductionOpName = L"Max";
/*static*/ const std::wstring PrimitiveFunction::InternalMinReductionOpName = L"Min";
/*static*/ const std::wstring PrimitiveFunction::InternalAllReductionOpName = L"All";
/*static*/ const std::wstring PrimitiveFunction::InternalAnyReductionOpName = L"Any";
/*static*/ std::wstring CompositeFunction::s_internalDefaultDynamicAxisName = L"";
/*static*/ std::wstring CompositeFunction::s_internalNoSequenceAxisName = L"noSequenceAxis";
// Replace any PlaceHolder Variables in the graph of Functions underlying 'this' CompositeFunction. All PlaceHolder variables // Replace any PlaceHolder Variables in the graph of Functions underlying 'this' CompositeFunction. All PlaceHolder variables
// should have been replaced before performing any Forward compute of 'this' Function. // should have been replaced before performing any Forward compute of 'this' Function.
/*virtual*/ void CompositeFunction::ReplacePlaceholders(const std::unordered_map<Placeholder, Variable>& placeholderReplacements, /*virtual*/ void CompositeFunction::ReplacePlaceholders(const std::unordered_map<Placeholder, Variable>& placeholderReplacements,
@ -122,22 +137,46 @@ namespace CNTK
computationNodePtr->SetLearningRateMultiplier(0.0); computationNodePtr->SetLearningRateMultiplier(0.0);
NDArrayViewPtr value = variable.IsConstant() ? Constant(variable).Value() : Parameter(variable).Value(); NDArrayViewPtr value = variable.IsConstant() ? Constant(variable).Value() : Parameter(variable).Value();
auto matrix = variable.IsConstant() ? value->GetMatrix<ElementType>()->AsReference() : value->GetWritableMatrix<ElementType>()->AsReference(); std::shared_ptr<const Matrix<ElementType>> valueMatrix = variable.IsConstant() ? value->GetMatrix<ElementType>() : value->GetWritableMatrix<ElementType>();
computationNodePtr->Value() = std::move(matrix); if (variable.IsParameter() || (valueMatrix->GetDeviceId() == network->GetDeviceId()))
computationNodePtr->Value() = valueMatrix->AsReference();
else
{
Matrix<ElementType> clonedMatrix(valueMatrix->GetNumRows(), valueMatrix->GetNumCols(), network->GetDeviceId(), valueMatrix->GetMatrixType(), valueMatrix->GetFormat());
clonedMatrix.AssignValuesOf(*valueMatrix);
computationNodePtr->Value() = std::move(clonedMatrix);
}
} }
else if (variable.IsInput()) else if (variable.IsInput())
{ {
// TODO: Support inputs with > 1 dynamic axes // TODO: Input variables currently are required to have the default batch axis
if (variable.DynamicAxes().size() != 1) auto dynamicAxes = variable.DynamicAxes();
LogicError("Currently only Input variables with one dynamic axis are supported"); auto foundDefaultBatchAxis = std::find(dynamicAxes.begin(), dynamicAxes.end(), Axis::DefaultBatchAxis());
if (foundDefaultBatchAxis == dynamicAxes.end())
LogicError("Currently Input Variables are required to have the DefaultBatchAxis as one of their dynamic axes");
auto dynamicAxis = variable.DynamicAxes()[0]; if (dynamicAxes.back() != Axis::DefaultBatchAxis())
if (dynamicAxis != Axis::DefaultDynamicAxis()) LogicError("Currently Input Variables are required to have the DefaultBatchAxis as their last dynamic axis");
LogicError("Currently only Input variables with DefaultDynamicAxis are supported");
if (IsSparseInput(variable)) // TODO: Support inputs with > 1 dynamic axes
computationNodePtr = builder.CreateSparseInputNode(variable.Name(), AsTensorShape(variable.Shape())); if ((dynamicAxes.size() < 1) || (dynamicAxes.size() > 2))
LogicError("Currently only Input variables with 1 or 2 dynamic axis are supported");
std::wstring internalDynamicAxisName;
if (dynamicAxes.size() == 1)
internalDynamicAxisName = s_internalNoSequenceAxisName;
else if (dynamicAxes[0] == Axis::DefaultDynamicAxis())
internalDynamicAxisName = s_internalDefaultDynamicAxisName;
else else
computationNodePtr = builder.CreateInputNode(variable.Name(), AsTensorShape(variable.Shape())); internalDynamicAxisName = dynamicAxes[0].Name();
if (!internalDynamicAxisName.empty())
network->AddNodeToNetAndAttachInputs(New<DynamicAxisNode<ElementType>>(network->GetDeviceId(), internalDynamicAxisName), {});
if (IsSparseInput(variable))
computationNodePtr = builder.CreateSparseInputNode(variable.Name(), AsTensorShape(variable.Shape()), internalDynamicAxisName);
else
computationNodePtr = builder.CreateInputNode(variable.Name(), AsTensorShape(variable.Shape()), internalDynamicAxisName);
if (variable.NeedsGradient()) if (variable.NeedsGradient())
{ {
@ -219,11 +258,29 @@ namespace CNTK
computationNodePtr = builder.Reciprocal(input0Node, function->Name()); computationNodePtr = builder.Reciprocal(input0Node, function->Name());
break; break;
case PrimitiveOpType::Softmax: case PrimitiveOpType::Softmax:
if (functionInputs[0].Shape().NumAxes() > 1)
InvalidArgument("Softmax operation can only be applied to a 1D input");
computationNodePtr = builder.Softmax(input0Node, function->Name()); computationNodePtr = builder.Softmax(input0Node, function->Name());
break; break;
case PrimitiveOpType::Hardmax:
computationNodePtr = builder.Hardmax(input0Node, function->Name());
break;
case PrimitiveOpType::TransposeAxes:
{
auto axis1 = Axis(functionConfig[L"axis1"].GetValue<std::wstring>());
auto axis2 = Axis(functionConfig[L"axis2"].GetValue<std::wstring>());
// The axis ids passed to the internal CNTK TransposeDimensionsNode are 1-based instead of 0-based
computationNodePtr = New<TransposeDimensionsNode<ElementType>>(network->GetDeviceId(), function->Name(), (int)(axis1.StaticAxisIndex() + 1), (int)(axis2.StaticAxisIndex() + 1));
network->AddNodeToNetAndAttachInputs(computationNodePtr, { input0Node });
break;
}
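// Worked example of the conversion above: transposing static axes 0 and 2 of a
// rank-3 input passes axis ids 1 and 3 to the internal TransposeDimensionsNode.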
case PrimitiveOpType::Where:
{
auto dynamicAxes = variable.DynamicAxes();
auto internalCNTKWhereNodeDynamicAxisName = (dynamicAxes == std::vector<Axis>({ Axis::DefaultBatchAxis() })) ? CompositeFunction::s_internalNoSequenceAxisName : dynamicAxes[0].Name();
computationNodePtr = New<WhereNode<ElementType>>(network->GetDeviceId(), function->Name(), internalCNTKWhereNodeDynamicAxisName);
network->AddNodeToNetAndAttachInputs(computationNodePtr, { input0Node });
break;
}
case PrimitiveOpType::Pooling: case PrimitiveOpType::Pooling:
{ {
PoolingType poolingType = (PoolingType)(functionConfig[L"poolingType"].GetValue<size_t>()); PoolingType poolingType = (PoolingType)(functionConfig[L"poolingType"].GetValue<size_t>());
@ -235,6 +292,9 @@ namespace CNTK
computationNodePtr = builder.Pooling(input0Node, AsCNTKPoolKind(poolingType), AsTensorShape(poolingWindowsShape, true), AsTensorShape(strides, true), autoPadding, AsTensorShape(lowerPad, true), AsTensorShape(upperPad, true), ImageLayoutKind::CHW, function->Name()); computationNodePtr = builder.Pooling(input0Node, AsCNTKPoolKind(poolingType), AsTensorShape(poolingWindowsShape, true), AsTensorShape(strides, true), autoPadding, AsTensorShape(lowerPad, true), AsTensorShape(upperPad, true), ImageLayoutKind::CHW, function->Name());
break; break;
} }
case PrimitiveOpType::SumAll:
computationNodePtr = builder.Sum(input0Node, function->Name());
break;
case PrimitiveOpType::Plus: case PrimitiveOpType::Plus:
computationNodePtr = builder.Plus(input0Node, input1Node, function->Name()); computationNodePtr = builder.Plus(input0Node, input1Node, function->Name());
break; break;
@ -268,6 +328,12 @@ namespace CNTK
computationNodePtr = builder.Times(input0Node, input1Node, numOutputAxes, function->Name()); computationNodePtr = builder.Times(input0Node, input1Node, numOutputAxes, function->Name());
break; break;
} }
case PrimitiveOpType::TransposeTimes:
{
size_t numOutputAxes = functionConfig[L"numOutputAxes"].GetValue<size_t>();
computationNodePtr = network->AddNodeToNetAndAttachInputs(New<TransposeTimesNode<ElementType>>(network->GetDeviceId(), function->Name(), numOutputAxes), { input0Node, input1Node });
break;
}
case PrimitiveOpType::Convolution: case PrimitiveOpType::Convolution:
{ {
NDShape outputMapCount, kernelShape; NDShape outputMapCount, kernelShape;
@ -296,35 +362,25 @@ namespace CNTK
{ {
Variable initialStateVar = functionInputs[0]; Variable initialStateVar = functionInputs[0];
Variable inputOperandVar = functionInputs[1]; Variable inputOperandVar = functionInputs[1];
// TODO: Current we only support a scalar initial state
if (!initialStateVar.IsConstant() || (initialStateVar.Shape().NumAxes() > 0))
LogicError("Currently PastValue/FutureValue Function only supports scalar initial state");
// TODO: We currently only support input operand with 1 static axis for PastValue/FutureValue
if (inputOperandVar.Shape().NumAxes() != 1)
LogicError("Currently PastValue/FutureValue Function only supports input operand with 1 static axis");
// TODO: We currently only support input operand with 1 dynamic axis for PastValue/FutureValue
if (inputOperandVar.DynamicAxes().size() != 1)
LogicError("Currently PastValue/FutureValue Function only supports input operand with 1 dynamic axis");
// Get the initial state of the PastValue/FutureValue operation // Get the initial state of the PastValue/FutureValue operation
ElementType initStateValue; ElementType initStateValue;
NDArrayView tempView({}, &initStateValue, 1, DeviceDescriptor::CPUDevice()); NDArrayView tempView({}, &initStateValue, 1, DeviceDescriptor::CPUDevice());
tempView.CopyFrom(*Constant(initialStateVar).Value()); tempView.CopyFrom(*Constant(initialStateVar).Value());
size_t stepSize = primitiveFunction->FunctionConfig()[L"stepSize"].GetValue<size_t>();
if (op == PrimitiveOpType::PastValue) if (op == PrimitiveOpType::PastValue)
computationNodePtr = builder.PastValue(input1Node, (float)initStateValue, inputOperandVar.Shape()[0], primitiveFunction->FunctionConfig()[L"stepSize"].GetValue<size_t>(), function->Name()); computationNodePtr = builder.PastValue(input1Node, (float)initStateValue, inputOperandVar.Shape().TotalSize(), stepSize, function->Name());
else else
computationNodePtr = builder.FutureValue(input1Node, (float)initStateValue, inputOperandVar.Shape()[0], primitiveFunction->FunctionConfig()[L"stepSize"].GetValue<size_t>(), function->Name()); computationNodePtr = builder.FutureValue(input1Node, (float)initStateValue, inputOperandVar.Shape().TotalSize(), stepSize, function->Name());
break; break;
} }
case PrimitiveOpType::ReduceSum: case PrimitiveOpType::ReduceElements:
{ {
// TODO: Use the new ReduceElements node instead of the legacy SumElements node for reduction. Currently ReduceElements has incorrect MBLayout inference. auto CNTKInternalReductionAxisIndex = (int)functionConfig[L"CNTKInternalReductionAxisIndex"].GetValue<size_t>();
//computationNodePtr = network->AddNodeToNetAndAttachInputs(New<ReduceElementsNode<ElementType>>(network->GetDeviceId(), function->Name(), L"Sum", 0), { input0Node }); auto reductionOpName = functionConfig[L"ReductionOpName"].GetValue<std::wstring>();
computationNodePtr = builder.Sum(input0Node, function->Name()); computationNodePtr = network->AddNodeToNetAndAttachInputs(New<ReduceElementsNode<ElementType>>(network->GetDeviceId(), function->Name(), reductionOpName, CNTKInternalReductionAxisIndex), { input0Node });
break; break;
} }
case PrimitiveOpType::BatchNormalization: case PrimitiveOpType::BatchNormalization:
@ -353,6 +409,25 @@ namespace CNTK
computationNodePtr = variableToNodeMap[variable]; computationNodePtr = variableToNodeMap[variable];
break; break;
case PrimitiveOpType::PackedIndex:
computationNodePtr = New<PackedIndexNode<ElementType>>(network->GetDeviceId(), function->Name());
network->AddNodeToNetAndAttachInputs(computationNodePtr, { input0Node, input1Node });
break;
case PrimitiveOpType::GatherPacked:
computationNodePtr = New<GatherPackedNode<ElementType>>(network->GetDeviceId(), function->Name());
network->AddNodeToNetAndAttachInputs(computationNodePtr, { input1Node, input0Node });
break;
case PrimitiveOpType::Slice:
{
auto axis = Axis(functionConfig[L"axis"].GetValue<std::wstring>());
int beginIndex = functionConfig[L"beginIndex"].GetValue<size_t>();
int endIndex = functionConfig[L"endIndex"].GetValue<size_t>();
// Internal CNTK SliceNode takes 1-based axis indices instead of 0-based
computationNodePtr = New<SliceNode<ElementType>>(network->GetDeviceId(), function->Name(), beginIndex, endIndex, (int)(axis.StaticAxisIndex() + 1));
network->AddNodeToNetAndAttachInputs(computationNodePtr, { input0Node });
break;
}
default: default:
LogicError("Specified op %s not yet supported", PrimitiveOpTypeName(op)); LogicError("Specified op %s not yet supported", PrimitiveOpTypeName(op));
break; break;
@ -486,11 +561,11 @@ namespace CNTK
if (value->Data()->Shape().NumAxes() == var.Shape().NumAxes()) if (value->Data()->Shape().NumAxes() == var.Shape().NumAxes())
return{ value->Data()->GetMatrix<ElementType>(), nullptr }; return{ value->Data()->GetMatrix<ElementType>(), nullptr };
if (value->Data()->Shape().NumAxes() != (var.Shape().NumAxes() + var.DynamicAxes().size() + 1)) if (value->Data()->Shape().NumAxes() < (var.Shape().NumAxes() + var.DynamicAxes().size()))
InvalidArgument("Value's number of axes should be larger than the Variable's number of axes by 1 + number of dynamic axes"); InvalidArgument("Value's number of axes should be larger than the Variable's number of axes by number of dynamic axes");
if (var.DynamicAxes().size() > 1) if (var.DynamicAxes().size() > 2)
LogicError("More than one dynamic axis for a variable is currently unsupported"); LogicError("More than 2 dynamic axis for a variable is currently unsupported");
size_t maxNumTimeSteps = value->Data()->Shape()[var.Shape().NumAxes()]; size_t maxNumTimeSteps = value->Data()->Shape()[var.Shape().NumAxes()];
size_t numSequences = value->Data()->Shape()[var.Shape().NumAxes() + 1]; size_t numSequences = value->Data()->Shape()[var.Shape().NumAxes() + 1];
@ -618,9 +693,9 @@ namespace CNTK
sequenceLengths.push_back(sequenceInfo.GetNumTimeSteps()); sequenceLengths.push_back(sequenceInfo.GetNumTimeSteps());
} }
// Reshuffle to data to unpack and uninterleave the CNTK form data // Reshuffle the data to unpack and uninterleave the CNTK form packed data
// Now generate the gather indices // Now generate the scatter indices
auto shuffledMatrixData = std::make_shared<Matrix<ElementType>>(matrix.GetNumRows(), maxNumTimeSteps * numSequences, matrix.GetDeviceId()); auto shuffledMatrixData = std::make_shared<Matrix<ElementType>>(matrix.GetNumRows(), maxNumTimeSteps * numSequences, matrix.GetDeviceId(), matrix.GetMatrixType(), matrix.GetFormat());
std::vector<size_t> sequencesShorterThanLongestSequence; std::vector<size_t> sequencesShorterThanLongestSequence;
for (size_t i = 0; i < numSequences; ++i) for (size_t i = 0; i < numSequences; ++i)
@ -659,15 +734,15 @@ namespace CNTK
} }
auto tensorView = new TensorView<ElementType>(shuffledMatrixData, AsTensorShape(valueDataShape)); auto tensorView = new TensorView<ElementType>(shuffledMatrixData, AsTensorShape(valueDataShape));
auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), StorageFormat::Dense, valueDataShape, readOnly, tensorView); auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(shuffledMatrixData->GetFormat()), valueDataShape, readOnly, tensorView);
return MakeSharedObject<Value>(data, mask); return MakeSharedObject<Value>(data, mask);
} }
template <typename ElementType> template <typename ElementType>
/*static*/ ValuePtr CompositeFunction::GetValueObjectFromCNTKImplMatrixAndMBLayout(Variable var, const Matrix<ElementType>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/) /*static*/ ValuePtr CompositeFunction::GetValueObjectFromCNTKImplMatrixAndMBLayout(Variable var, const Matrix<ElementType>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/)
{ {
if (var.DynamicAxes().size() > 1) if (var.DynamicAxes().size() > 2)
LogicError("More than one dynamic axis for a variable is currently unsupported"); LogicError("More than 2 dynamic axis for a variable is currently unsupported");
if (AsDataType<ElementType>() != var.GetDataType()) if (AsDataType<ElementType>() != var.GetDataType())
LogicError("The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(var.GetDataType())); LogicError("The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(var.GetDataType()));
@ -732,7 +807,7 @@ namespace CNTK
MBLayoutPtr layout = CNTKMatrixAndMBLayout.second; MBLayoutPtr layout = CNTKMatrixAndMBLayout.second;
auto nodeLayout = computationNode->GetMBLayout(); auto nodeLayout = computationNode->GetMBLayout();
if (((layout == nullptr) != (nodeLayout == nullptr)) || ((layout != nullptr) && (*layout != *nodeLayout))) if (((layout == nullptr) != (nodeLayout == nullptr)) || ((layout != nullptr) && (*layout != *nodeLayout)))
InvalidArgument("The layout of the specified gradient Value in incompatible with the layout of the corresponding Variable computed during Forward call"); InvalidArgument("The layout of the specified gradient Value is incompatible with the layout of the corresponding Variable computed during Forward call");
computationNode->As<ComputationNode<ElementType>>()->AssignGradient(*CNTKMatrixAndMBLayout.first); computationNode->As<ComputationNode<ElementType>>()->AssignGradient(*CNTKMatrixAndMBLayout.first);
} }
@ -814,12 +889,9 @@ namespace CNTK
} }
if (varValue == nullptr) if (varValue == nullptr)
{ varValue = nodeValue->DeepClone();
auto data = MakeSharedObject<NDArrayView>(var.GetDataType(), valueShape, AsDeviceDescriptor(computationNode->ValuePtr()->GetDeviceId())); else
auto mask = (nodeValue->Mask() != nullptr) ? MakeSharedObject<NDMask>(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr; varValue->CopyFrom(*nodeValue);
varValue = MakeSharedObject<Value>(data, mask);
}
varValue->CopyFrom(*nodeValue);
} }
void CompositeFunction::GetNetworkOutputs(std::unordered_map<Variable, ValuePtr>& outputs) void CompositeFunction::GetNetworkOutputs(std::unordered_map<Variable, ValuePtr>& outputs)
@ -984,7 +1056,7 @@ namespace CNTK
FunctionPtr Round(const Variable& operand, const std::wstring& name/* = L""*/) FunctionPtr Round(const Variable& operand, const std::wstring& name/* = L""*/)
{ {
return Floor(Plus(operand, Constant(NDShape({}), 0.5f)), name); return Floor(Plus(operand, ScalarConstant(operand.GetDataType(), 0.5f)), name);
} }
FunctionPtr Floor(const Variable& operand, const std::wstring& name/* = L""*/) FunctionPtr Floor(const Variable& operand, const std::wstring& name/* = L""*/)
@ -1012,6 +1084,71 @@ namespace CNTK
return UnaryOp(PrimitiveOpType::Softmax, operand, Dictionary(), name); return UnaryOp(PrimitiveOpType::Softmax, operand, Dictionary(), name);
} }
FunctionPtr Hardmax(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::Hardmax, operand, Dictionary(), name);
}
FunctionPtr TransposeAxes(const Variable& operand, const Axis& axis1, const Axis& axis2, const std::wstring& name /*= L""*/)
{
if (!axis1.IsStaticAxis() || !axis2.IsStaticAxis())
LogicError("TransposeAxes currently does not support transposing dynamic axes");
auto additionalProperties = Dictionary();
additionalProperties[L"axis1"] = axis1.Name();
additionalProperties[L"axis2"] = axis2.Name();
return UnaryOp(PrimitiveOpType::TransposeAxes, operand, std::move(additionalProperties), name);
}
FunctionPtr Slice(const Variable& operand, const Axis& axis, int beginIndex, int endIndex, const std::wstring& name /*= L""*/)
{
if ((endIndex - beginIndex) <= 0)
InvalidArgument("CNTK::Slice: endIndex (%d) - beginIndex (%d) must be a positive number", endIndex, beginIndex);
if (axis == Axis::DefaultBatchAxis())
LogicError("Slice is currently unsupported along the batch axis");
if (axis.IsStaticAxis())
return Internal::Slice(operand, axis, beginIndex, endIndex, name);
auto operandAxes = operand.DynamicAxes();
auto findAxis = std::find(operandAxes.begin(), operandAxes.end(), axis);
if (findAxis == operandAxes.end())
InvalidArgument("The specified dynamic axis named %S does not match any of the dynamic axes of the operand", axis.Name().c_str());
auto beginFlagsLambda = [beginIndex, operand]() {
return (beginIndex > 0) ? Minus(ScalarConstant(operand.GetDataType(), 1.0f), Internal::IsWithin(operand, beginIndex)) : Internal::IsWithin(operand, beginIndex);
};
auto endFlagsLambda = [endIndex, operand]() {
return (endIndex > 0) ? Internal::IsWithin(operand, endIndex) : Minus(ScalarConstant(operand.GetDataType(), 1.0f), Internal::IsWithin(operand, endIndex));
};
FunctionPtr flags;
if (beginIndex == 0)
flags = endFlagsLambda();
else if (endIndex == 0)
flags = beginFlagsLambda();
else
flags = ElementTimes(beginFlagsLambda(), endFlagsLambda());
// Since we are slicing along a dynamic axis, the output variable's dynamic axes will be different from the operand's
std::vector<Axis> newDynamicAxes;
for (auto operandAxis : operandAxes)
{
if (operandAxis == axis)
{
// If we are selecting just one frame from the dynamic axis, we can remove that axis
if ((endIndex - beginIndex) > 1)
newDynamicAxes.push_back(CompositeFunction::NextAutoGeneratedDynamicAxis());
}
else
newDynamicAxes.push_back(operandAxis);
}
return Internal::Gather(operand, flags, newDynamicAxes);
}
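// A minimal usage sketch of the dynamic-axis Slice path above (illustration only, not
// part of the API). The shape and the Placeholder-based operand are hypothetical, and
// the sketch assumes the operand carries the default dynamic axes as a regular Input
// would; keeping frames [0,3) of the sequence axis then routes through
// IsWithin/Where/GatherPacked and yields an output with a fresh auto-generated axis.
inline FunctionPtr SliceFirstThreeFramesExample()
{
    auto operand = Placeholder(NDShape({ 10 })); // hypothetical 10-dimensional sequence input
    return Slice(operand, Axis::DefaultDynamicAxis(), 0, 3);
}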
FunctionPtr BinaryOp(PrimitiveOpType op, const Variable& leftOperand, const Variable& rightOperand, Dictionary&& opConfig, const std::wstring& name) FunctionPtr BinaryOp(PrimitiveOpType op, const Variable& leftOperand, const Variable& rightOperand, Dictionary&& opConfig, const std::wstring& name)
{ {
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(op, std::vector<Variable>({ leftOperand, rightOperand }), std::move(opConfig), name), name); return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(op, std::vector<Variable>({ leftOperand, rightOperand }), std::move(opConfig), name), name);
@ -1074,6 +1211,13 @@ namespace CNTK
return BinaryOp(PrimitiveOpType::Times, leftOperand, rightOperand, std::move(additionalProperties), name); return BinaryOp(PrimitiveOpType::Times, leftOperand, rightOperand, std::move(additionalProperties), name);
} }
FunctionPtr TransposeTimes(const Variable& leftOperand, const Variable& rightOperand, size_t numOutputAxes /*= 1*/, const std::wstring& name/* = L""*/)
{
auto additionalProperties = Dictionary();
additionalProperties[L"numOutputAxes"] = numOutputAxes;
return BinaryOp(PrimitiveOpType::TransposeTimes, leftOperand, rightOperand, std::move(additionalProperties), name);
}
FunctionPtr SquaredError(const Variable& prediction, const Variable& targets, const std::wstring& name/* = L""*/) FunctionPtr SquaredError(const Variable& prediction, const Variable& targets, const std::wstring& name/* = L""*/)
{ {
return BinaryOp(PrimitiveOpType::SquaredError, prediction, targets, Dictionary(), name); return BinaryOp(PrimitiveOpType::SquaredError, prediction, targets, Dictionary(), name);
@ -1081,18 +1225,20 @@ namespace CNTK
FunctionPtr CrossEntropyWithSoftmax(const Variable& prediction, const Variable& labels, const std::wstring& name/* = L""*/) FunctionPtr CrossEntropyWithSoftmax(const Variable& prediction, const Variable& labels, const std::wstring& name/* = L""*/)
{ {
return BinaryOp(PrimitiveOpType::CrossEntropyWithSoftmax, prediction, labels, Dictionary(), name); return ReduceSum(Minus(ReduceLogSum(prediction, Axis(0)), TransposeTimes(labels, prediction)), name);
//return BinaryOp(PrimitiveOpType::CrossEntropyWithSoftmax, prediction, labels, Dictionary(), name);
} }
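// Derivation note for the composition above: with one-hot labels y and prediction
// logits z, TransposeTimes(labels, prediction) picks out z_y and ReduceLogSum computes
// log(sum_j exp(z_j)), so the difference is exactly -log(softmax(z)_y), the
// cross entropy with softmax.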
FunctionPtr ClassificationError(const Variable& prediction, const Variable& labels, const std::wstring& name/* = L""*/) FunctionPtr ClassificationError(const Variable& prediction, const Variable& labels, const std::wstring& name/* = L""*/)
{ {
return BinaryOp(PrimitiveOpType::ClassificationError, prediction, labels, Dictionary(), name); return ReduceSum(Minus(ScalarConstant(prediction.GetDataType(), 1.0f), TransposeTimes(labels, Hardmax(prediction))), name);
//return BinaryOp(PrimitiveOpType::ClassificationError, prediction, labels, Dictionary(), name);
} }
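// Likewise, TransposeTimes(labels, Hardmax(prediction)) is 1 when the argmax of the
// prediction matches the one-hot label and 0 otherwise, so the expression above
// evaluates to the per-sample 0/1 classification error.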
FunctionPtr PastValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/) FunctionPtr PastValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/)
{ {
if (operand.DynamicAxes().size() != 1) if (operand.DynamicAxes().size() != 2)
InvalidArgument("PastValue overload that does not explicitly specify a dynamic axis can only be used for operands with exactly one dynamic axis"); InvalidArgument("PastValue overload that does not explicitly specify a dynamic axis can only be used for operands with exactly one dynamic sequence-axis");
auto additionalProperties = Dictionary(); auto additionalProperties = Dictionary();
additionalProperties[L"stepSize"] = DictionaryValue(stepSize); additionalProperties[L"stepSize"] = DictionaryValue(stepSize);
@ -1101,8 +1247,8 @@ namespace CNTK
FunctionPtr FutureValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/) FunctionPtr FutureValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/)
{ {
if (operand.DynamicAxes().size() != 1) if (operand.DynamicAxes().size() != 2)
InvalidArgument("FutureValue overload that does not explicitly specify a dynamic axis can only be used for operands with exactly one dynamic axis"); InvalidArgument("FutureValue overload that does not explicitly specify a dynamic axis can only be used for operands with exactly one dynamic sequence-axis");
auto additionalProperties = Dictionary(); auto additionalProperties = Dictionary();
additionalProperties[L"stepSize"] = DictionaryValue(stepSize); additionalProperties[L"stepSize"] = DictionaryValue(stepSize);
@ -1111,7 +1257,17 @@ namespace CNTK
FunctionPtr ReduceSum(const Variable& operand, const std::wstring& name/* = L""*/) FunctionPtr ReduceSum(const Variable& operand, const std::wstring& name/* = L""*/)
{ {
return UnaryOp(PrimitiveOpType::ReduceSum, operand, Dictionary(), name); return UnaryOp(PrimitiveOpType::SumAll, operand, Dictionary(), name);
}
FunctionPtr ReduceSum(const Variable& operand, const Axis& axis, const std::wstring& name/* = L""*/)
{
return Internal::ReduceElements(operand, PrimitiveFunction::InternalSumReductionOpName, axis, name);
}
FunctionPtr ReduceLogSum(const Variable& operand, const Axis& axis, const std::wstring& name/* = L""*/)
{
return Internal::ReduceElements(operand, PrimitiveFunction::InternalLogSumReductionOpName, axis, name);
} }
FunctionPtr PerDimMeanVarianceNormalize(const Variable& operand, const NDArrayViewPtr& mean, const NDArrayViewPtr& invStdDev, const std::wstring& name /*= L""*/) FunctionPtr PerDimMeanVarianceNormalize(const Variable& operand, const NDArrayViewPtr& mean, const NDArrayViewPtr& invStdDev, const std::wstring& name /*= L""*/)
@ -1207,4 +1363,94 @@ namespace CNTK
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Combine, inputs, Dictionary(), name), name); return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Combine, inputs, Dictionary(), name), name);
} }
namespace Internal
{
FunctionPtr PackedIndex(const Variable& operand, const Variable& index, const std::wstring& name /*= L""*/)
{
return BinaryOp(PrimitiveOpType::PackedIndex, operand, index, Dictionary(), name);
}
FunctionPtr GatherPacked(const Variable& operand, const Variable& packedIndex, const std::wstring& name /*= L""*/)
{
return BinaryOp(PrimitiveOpType::GatherPacked, operand, packedIndex, Dictionary(), name);
}
FunctionPtr ZeroesLike(const Variable& operand)
{
if (operand.Shape().NumAxes() > 1)
LogicError("ZerosLike currently does not support operands with more than 1 static axes");
auto rowSliceFunc = Internal::Slice(operand, Axis(0), 0, 1);
return Minus(rowSliceFunc, rowSliceFunc);
}
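// Note on the construction above: slicing out one row and subtracting it from itself
// yields an all-zero [1]-shaped output that still carries the operand's dynamic axes
// and sequence layout, which is what IsWithin below needs as a recurrence input.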
FunctionPtr IsWithin(const Variable& operand, int offset, const std::wstring& name /*= L""*/)
{
if (offset == 0)
InvalidArgument("Internal::CNTK::IsWithin: The offset must be positive");
if (offset > 0)
return PastValue(ScalarConstant(operand.GetDataType(), 1.0f), ZeroesLike(operand), offset, name);
else
return FutureValue(ScalarConstant(operand.GetDataType(), 1.0f), ZeroesLike(operand), -offset, name);
}
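// Worked example: for a sequence of length 5 and offset = 2, the PastValue recurrence
// (initial state 1 over an all-zero input) emits the flags 1 1 0 0 0, i.e. it marks
// positions t < 2; a negative offset marks the trailing |offset| positions via
// FutureValue instead.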
FunctionPtr Where(const Variable& condition, const std::vector<Axis>& newDynamicAxes, const std::wstring& name /*= L""*/)
{
auto additionalProperties = Dictionary();
std::vector<std::wstring> newDynamicAxesNames;
for (auto axis : newDynamicAxes)
newDynamicAxesNames.push_back(axis.Name());
additionalProperties[L"newDynamicAxes"] = AsDictionaryValueVector(newDynamicAxesNames);
return UnaryOp(PrimitiveOpType::Where, condition, std::move(additionalProperties), name);
}
FunctionPtr Gather(const Variable& operand, const Variable& condition, const std::vector<Axis>& newDynamicAxes, const std::wstring& name /*= L""*/)
{
return Internal::GatherPacked(operand, Internal::PackedIndex(operand, Where(condition, newDynamicAxes)));
}
FunctionPtr Slice(const Variable& operand, const Axis& axis, int beginIndex, int endIndex, const std::wstring& name /*= L""*/)
{
auto additionalProperties = Dictionary();
additionalProperties[L"axis"] = axis.Name();
additionalProperties[L"beginIndex"] = (size_t)beginIndex;
additionalProperties[L"endIndex"] = (size_t)endIndex;
return UnaryOp(PrimitiveOpType::Slice, operand, std::move(additionalProperties), name);
}
FunctionPtr ReduceElements(const Variable& operand, const std::wstring& reductionOpName, const Axis& axis, const std::wstring& name /*= L""*/)
{
using namespace std::placeholders;
if (axis.IsStaticAxis())
{
auto additionalProperties = Dictionary();
additionalProperties[L"CNTKInternalReductionAxisIndex"] = (size_t)(axis.StaticAxisIndex() + 1);
additionalProperties[L"ReductionOpName"] = reductionOpName;
return UnaryOp(PrimitiveOpType::ReduceElements, operand, std::move(additionalProperties), name);
}
if (axis == Axis::DefaultBatchAxis())
LogicError("Reduction is currently unsupported along the batch axis");
if (reductionOpName != PrimitiveFunction::InternalSumReductionOpName)
LogicError("%S reduction along dynamic axis is currently unsupported", reductionOpName.c_str());
std::function<FunctionPtr(const Variable& leftOperand, const Variable& rightOperand)> reductionFunctor;
if (reductionOpName == PrimitiveFunction::InternalSumReductionOpName)
reductionFunctor = std::bind(Plus, _1, _2, L"");
// We are reducing over a dynamic axis which is currently implemented using recurrence
auto cumulativeSumFunctionPlaceholder = Placeholder(operand.Shape());
auto prevAccumulatedValuesFunction = PastValue(ScalarConstant(operand.GetDataType(), 0.0f), cumulativeSumFunctionPlaceholder, 1);
auto cumulativeSumFunction = reductionFunctor(prevAccumulatedValuesFunction, operand);
cumulativeSumFunction->ReplacePlaceholders({ { cumulativeSumFunctionPlaceholder, cumulativeSumFunction } });
return CNTK::Slice(cumulativeSumFunction, axis, -1, 0);
}
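// Worked illustration of the recurrence above: for frames x_0, x_1, x_2 the cumulative
// sum s_t = s_{t-1} + x_t (with s_{-1} = 0) produces x_0, x_0+x_1, x_0+x_1+x_2, and the
// trailing Slice(..., -1, 0) keeps only the last frame, i.e. the sum over the dynamic axis.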
}
} }


@ -27,7 +27,12 @@ namespace CNTK
Abs, Abs,
Reciprocal, Reciprocal,
Softmax, Softmax,
Hardmax,
TransposeAxes,
Where,
Slice,
Pooling, Pooling,
SumAll,
Plus, Plus,
Minus, Minus,
ElementTimes, ElementTimes,
@ -37,14 +42,17 @@ namespace CNTK
LessEqual, LessEqual,
Greater, Greater,
GreaterEqual, GreaterEqual,
PackedIndex,
GatherPacked,
Times, Times,
TransposeTimes,
Convolution, Convolution,
SquaredError, SquaredError,
CrossEntropyWithSoftmax, CrossEntropyWithSoftmax,
ClassificationError, ClassificationError,
PastValue, PastValue,
FutureValue, FutureValue,
ReduceSum, ReduceElements,
BatchNormalization, BatchNormalization,
Combine, Combine,
}; };
@ -77,7 +85,12 @@ namespace CNTK
{ PrimitiveOpType::Abs, "Abs" }, { PrimitiveOpType::Abs, "Abs" },
{ PrimitiveOpType::Reciprocal, "Reciprocal" }, { PrimitiveOpType::Reciprocal, "Reciprocal" },
{ PrimitiveOpType::Softmax, "Softmax" }, { PrimitiveOpType::Softmax, "Softmax" },
{ PrimitiveOpType::Hardmax, "Hardmax" },
{ PrimitiveOpType::TransposeAxes, "TransposeAxes" },
{ PrimitiveOpType::Where, "Where" },
{ PrimitiveOpType::Slice, "Slice" },
{ PrimitiveOpType::Pooling, "Pooling" }, { PrimitiveOpType::Pooling, "Pooling" },
{ PrimitiveOpType::SumAll, "SumAll" },
{ PrimitiveOpType::Plus, "Plus" }, { PrimitiveOpType::Plus, "Plus" },
{ PrimitiveOpType::Minus, "Minus" }, { PrimitiveOpType::Minus, "Minus" },
{ PrimitiveOpType::ElementTimes, "ElementTimes" }, { PrimitiveOpType::ElementTimes, "ElementTimes" },
@ -87,14 +100,17 @@ namespace CNTK
{ PrimitiveOpType::LessEqual, "LessEqual" }, { PrimitiveOpType::LessEqual, "LessEqual" },
{ PrimitiveOpType::Greater, "Greater" }, { PrimitiveOpType::Greater, "Greater" },
{ PrimitiveOpType::GreaterEqual, "GreaterEqual" }, { PrimitiveOpType::GreaterEqual, "GreaterEqual" },
{ PrimitiveOpType::PackedIndex, "PackedIndex" },
{ PrimitiveOpType::GatherPacked, "GatherPacked" },
{ PrimitiveOpType::Times, "Times" }, { PrimitiveOpType::Times, "Times" },
{ PrimitiveOpType::TransposeTimes, "TransposeTimes" },
{ PrimitiveOpType::Convolution, "Convolution" }, { PrimitiveOpType::Convolution, "Convolution" },
{ PrimitiveOpType::SquaredError, "SquaredError" }, { PrimitiveOpType::SquaredError, "SquaredError" },
{ PrimitiveOpType::CrossEntropyWithSoftmax, "CrossEntropyWithSoftmax" }, { PrimitiveOpType::CrossEntropyWithSoftmax, "CrossEntropyWithSoftmax" },
{ PrimitiveOpType::ClassificationError, "ClassificationError" }, { PrimitiveOpType::ClassificationError, "ClassificationError" },
{ PrimitiveOpType::PastValue, "PastValue" }, { PrimitiveOpType::PastValue, "PastValue" },
{ PrimitiveOpType::FutureValue, "FutureValue" }, { PrimitiveOpType::FutureValue, "FutureValue" },
{ PrimitiveOpType::ReduceSum, "ReduceSum" }, { PrimitiveOpType::ReduceElements, "ReduceElements" },
{ PrimitiveOpType::BatchNormalization, "BatchNormalization" }, { PrimitiveOpType::BatchNormalization, "BatchNormalization" },
{ PrimitiveOpType::Combine, "Combine" } { PrimitiveOpType::Combine, "Combine" }
}; };
@ -107,6 +123,15 @@ namespace CNTK
class PrimitiveFunction final : public Function class PrimitiveFunction final : public Function
{ {
public:
static const std::wstring InternalSumReductionOpName;
static const std::wstring InternalLogSumReductionOpName;
static const std::wstring InternalMeanReductionOpName;
static const std::wstring InternalMaxReductionOpName;
static const std::wstring InternalMinReductionOpName;
static const std::wstring InternalAllReductionOpName;
static const std::wstring InternalAnyReductionOpName;
public: public:
PrimitiveFunction(PrimitiveOpType op, const std::vector<Variable>& inputs, Dictionary&& functionConfig, const std::wstring& functionName = L"") PrimitiveFunction(PrimitiveOpType op, const std::vector<Variable>& inputs, Dictionary&& functionConfig, const std::wstring& functionName = L"")
: Function(inputs, GetOutputVariables(op, inputs, this, functionConfig), nullptr, functionName), m_op(op), m_functionConfig(std::move(functionConfig)) : Function(inputs, GetOutputVariables(op, inputs, this, functionConfig), nullptr, functionName), m_op(op), m_functionConfig(std::move(functionConfig))
@ -242,16 +267,26 @@ namespace CNTK
DataType outputDataType = inputs[0].GetDataType(); DataType outputDataType = inputs[0].GetDataType();
// We currently require that the inputs' dynamic axes if any match // We currently require that the inputs' dynamic axes if any match
std::vector<Axis> outputDynamicAxes = inputs[0].DynamicAxes(); std::vector<Axis> outputDynamicAxes;
for (auto inputVar : inputs) if (op == PrimitiveOpType::Where)
;
else if ((op == PrimitiveOpType::PackedIndex) || (op == PrimitiveOpType::GatherPacked))
{ {
auto currentInputDynamicAxes = inputVar.DynamicAxes(); outputDynamicAxes = inputs[1].DynamicAxes();
if (outputDynamicAxes.empty()) }
outputDynamicAxes = currentInputDynamicAxes; else
else {
outputDynamicAxes = inputs[0].DynamicAxes();
for (auto inputVar : inputs)
{ {
if (!currentInputDynamicAxes.empty() && (currentInputDynamicAxes != outputDynamicAxes)) auto currentInputDynamicAxes = inputVar.DynamicAxes();
LogicError("Currently if an operand of a binary elementwise operation has any dynamic axes, those must match the dynamic axes of the other operand"); if (outputDynamicAxes.empty())
outputDynamicAxes = currentInputDynamicAxes;
else
{
if (!currentInputDynamicAxes.empty() && (currentInputDynamicAxes != outputDynamicAxes))
LogicError("Currently if an operand of a binary elementwise operation has any dynamic axes, those must match the dynamic axes of the other operand");
}
} }
} }
@ -268,9 +303,38 @@ namespace CNTK
case PrimitiveOpType::Abs: case PrimitiveOpType::Abs:
case PrimitiveOpType::Reciprocal: case PrimitiveOpType::Reciprocal:
case PrimitiveOpType::Softmax: case PrimitiveOpType::Softmax:
case PrimitiveOpType::Hardmax:
assert(inputs.size() == 1); assert(inputs.size() == 1);
if (((op == PrimitiveOpType::Softmax) || (op == PrimitiveOpType::Hardmax)) && (inputs[0].Shape().NumAxes() > 1))
InvalidArgument("Softmax/Hardmax operation can only be applied to a 1D input");
outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[0].Shape()), outputDataType, owner, outputDynamicAxes)); outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[0].Shape()), outputDataType, owner, outputDynamicAxes));
break; break;
case PrimitiveOpType::TransposeAxes:
{
assert(inputs.size() == 1);
auto axis1 = Axis(functionConfig[L"axis1"].GetValue<std::wstring>());
auto axis2 = Axis(functionConfig[L"axis2"].GetValue<std::wstring>());
if (!axis1.IsStaticAxis() || !axis2.IsStaticAxis())
LogicError("TransposeAxes operation currently does not support transposing dynamic axes");
auto transposedTensorShape = AsTensorShape(inputs[0].Shape(), true);
transposedTensorShape.SwapDimsInPlace(axis1.StaticAxisIndex(), axis2.StaticAxisIndex());
outputs.push_back(Variable(AsNDShape(transposedTensorShape), outputDataType, owner, outputDynamicAxes));
break;
}
case PrimitiveOpType::Where:
{
assert(inputs.size() == 1);
std::vector<Axis> newDynamicAxes;
auto newDynamicAxesNames = AsBasicElementTypeVector<std::wstring>(functionConfig[L"newDynamicAxes"].GetValue<std::vector<DictionaryValue>>());
for (auto axisName : newDynamicAxesNames)
newDynamicAxes.push_back(Axis(axisName));
outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[0].Shape()), outputDataType, owner, newDynamicAxes));
break;
}
case PrimitiveOpType::Pooling: case PrimitiveOpType::Pooling:
{ {
assert(inputs.size() == 1); assert(inputs.size() == 1);
@ -282,6 +346,10 @@ namespace CNTK
outputs.push_back(Variable(ConvolutionOpOutputShape(inputs[0].Shape(), poolingWindowsShape, { 1 }, strides, { true }, autoPadding, lowerPad, upperPad, false), outputDataType, owner, outputDynamicAxes)); outputs.push_back(Variable(ConvolutionOpOutputShape(inputs[0].Shape(), poolingWindowsShape, { 1 }, strides, { true }, autoPadding, lowerPad, upperPad, false), outputDataType, owner, outputDynamicAxes));
break; break;
} }
case PrimitiveOpType::SumAll:
assert(inputs.size() == 1);
outputs.push_back(Variable({}, outputDataType, owner, std::vector<Axis>({})));
break;
case PrimitiveOpType::Plus: case PrimitiveOpType::Plus:
case PrimitiveOpType::Minus: case PrimitiveOpType::Minus:
case PrimitiveOpType::ElementTimes: case PrimitiveOpType::ElementTimes:
@ -297,15 +365,26 @@ namespace CNTK
case PrimitiveOpType::Times: case PrimitiveOpType::Times:
{ {
assert(inputs.size() == 2); assert(inputs.size() == 2);
// TODO: Support dynamic axes on the left operand
if (!inputs[0].DynamicAxes().empty())
LogicError("Dynamic axes are currently unsupported for left operand of a Times operation");
size_t numOutputAxes = functionConfig[L"numOutputAxes"].GetValue<size_t>(); size_t numOutputAxes = functionConfig[L"numOutputAxes"].GetValue<size_t>();
outputs.push_back(Variable(TimesOpOutputShape(inputs[0].Shape(), inputs[1].Shape(), numOutputAxes), outputDataType, owner, outputDynamicAxes)); outputs.push_back(Variable(TimesOpOutputShape(inputs[0].Shape(), inputs[1].Shape(), numOutputAxes), outputDataType, owner, outputDynamicAxes));
break; break;
} }
case PrimitiveOpType::TransposeTimes:
{
assert(inputs.size() == 2);
auto numLeftOperandAxes = inputs[0].Shape().NumAxes();
if (numLeftOperandAxes > 2)
InvalidArgument("TransposeTimes operation only supports left operands of rank 1 or 2");
NDShape transposedLeftOperandShape(2, 1);
for (size_t i = 0; i < numLeftOperandAxes; ++i)
transposedLeftOperandShape[transposedLeftOperandShape.NumAxes() - i - 1] = inputs[0].Shape()[i];
size_t numOutputAxes = functionConfig[L"numOutputAxes"].GetValue<size_t>();
outputs.push_back(Variable(TimesOpOutputShape(transposedLeftOperandShape, inputs[1].Shape(), numOutputAxes), outputDataType, owner, outputDynamicAxes));
break;
}
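// Shape sketch for the inference above: a [K x N] left operand becomes [N x K] (a
// rank-1 [K] operand is padded to [1 x K]), so TransposeTimes of a [K x N] matrix with
// a [K] vector infers a [N]-shaped output, matching Times on the explicitly
// transposed matrix.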
case PrimitiveOpType::Convolution: case PrimitiveOpType::Convolution:
{ {
assert(inputs.size() == 2); assert(inputs.size() == 2);
@ -341,26 +420,45 @@ namespace CNTK
for (size_t i = 0; i < inputs[0].Shape().NumAxes(); ++i) for (size_t i = 0; i < inputs[0].Shape().NumAxes(); ++i)
reductionAxes.push_back(i); reductionAxes.push_back(i);
outputs.push_back(Variable(ReductionOpOutputShape(op, predictionShape, reductionAxes), outputDataType, owner, {})); outputs.push_back(Variable(ReductionOpOutputShape(op, predictionShape, reductionAxes), outputDataType, owner, std::vector<Axis>({})));
break; break;
} }
case PrimitiveOpType::PastValue: case PrimitiveOpType::PastValue:
case PrimitiveOpType::FutureValue: case PrimitiveOpType::FutureValue:
{
assert(inputs.size() == 2); assert(inputs.size() == 2);
Variable initialStateVar = inputs[0];
Variable inputOperandVar = inputs[1];
// TODO: Currently we only support a scalar initial state
if (!initialStateVar.IsConstant() || (initialStateVar.Shape().NumAxes() > 0))
LogicError("Currently PastValue/FutureValue Function only supports scalar initial state");
// TODO: We currently only support input operand with 1 static axis for PastValue/FutureValue
if (inputOperandVar.Shape().NumAxes() > 1)
LogicError("Currently PastValue/FutureValue Function only supports input operand with <= 1 static axis");
// TODO: We currently only support input operands with 2 dynamic axes (1 sequence-axis and 1 batch-axis) for PastValue/FutureValue
if (inputOperandVar.DynamicAxes().size() != 2)
LogicError("Currently PastValue/FutureValue Function only supports input operand with with 2 dynamic axis (1 sequence-axis and 1 batch-axis)");
outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[1].Shape()), outputDataType, owner, outputDynamicAxes)); outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[1].Shape()), outputDataType, owner, outputDynamicAxes));
break; break;
case PrimitiveOpType::ReduceSum: }
case PrimitiveOpType::ReduceElements:
{ {
assert(inputs.size() == 1); assert(inputs.size() == 1);
auto CNTKInternalReductionAxisIndex = functionConfig[L"CNTKInternalReductionAxisIndex"].GetValue<size_t>();
// TODO: For reductions, we should remove any of the dynamic axes from 'outputDynamicAxes' that are being reduced over.
// Currently we only support reductions that reduce over all axes
std::vector<Axis> reductionOutputDynamicAxes = {};
std::vector<size_t> reductionAxes; std::vector<size_t> reductionAxes;
for (size_t i = 0; i < inputs[0].Shape().NumAxes(); ++i) // TODO: Do not use an integer literal for the special value of axis id that indicates all static axes
reductionAxes.push_back(i); if (CNTKInternalReductionAxisIndex == 0)
{
for (size_t i = 0; i < inputs[0].Shape().NumAxes(); ++i)
reductionAxes.push_back(i);
}
else
reductionAxes.push_back(CNTKInternalReductionAxisIndex - 1);
outputs.push_back(Variable(ReductionOpOutputShape(op, inputs[0].Shape(), reductionAxes), outputDataType, owner, reductionOutputDynamicAxes)); outputs.push_back(Variable(ReductionOpOutputShape(op, inputs[0].Shape(), reductionAxes), outputDataType, owner, inputs[0].DynamicAxes()));
break; break;
} }
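// Mapping used above (matching the convention in Internal::ReduceElements): a stored
// index of 0 reduces over all static axes, while a stored index i > 0 reduces over
// static axis i - 1, e.g. 1 -> static axis 0.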
case PrimitiveOpType::BatchNormalization: case PrimitiveOpType::BatchNormalization:
@ -369,6 +467,60 @@ namespace CNTK
case PrimitiveOpType::Combine: case PrimitiveOpType::Combine:
outputs = inputs; outputs = inputs;
break; break;
case PrimitiveOpType::PackedIndex:
outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[1].Shape()), outputDataType, owner, outputDynamicAxes));
break;
case PrimitiveOpType::GatherPacked:
{
bool sourceHasDynamicAxis = !inputs[0].DynamicAxes().empty();
NDShape outputShape;
// inherit tensor dimension from sourceData, minus the last (column or time) dimension. TODO this needs to become simpler...
if (sourceHasDynamicAxis)
outputShape = inputs[0].Shape();
else
{
if (inputs[0].Shape().NumAxes() > 1)
outputShape = inputs[0].Shape().SubShape(0, inputs[0].Shape().NumAxes() - 1);
else
outputShape = {};
}
outputs.push_back(Variable(outputShape, outputDataType, owner, outputDynamicAxes));
break;
}
case PrimitiveOpType::Slice:
{
auto axis = Axis(functionConfig[L"axis"].GetValue<std::wstring>());
int beginIndex = functionConfig[L"beginIndex"].GetValue<size_t>();
int endIndex = functionConfig[L"endIndex"].GetValue<size_t>();
if (!axis.IsStaticAxis())
LogicError("Built-in Slice operation currently does not support slicing along dynamic axis");
if (axis.StaticAxisIndex() >= inputs[0].Shape().NumAxes())
InvalidArgument("The specified axis index (%d) for the Slice operation is outside the bounds of the available axes of the input", (int)axis.StaticAxisIndex());
size_t sliceAxisDim = inputs[0].Shape()[axis.StaticAxisIndex()];
int realBeginIndex = (beginIndex >= 0) ? beginIndex : beginIndex + sliceAxisDim;
int realEndIndex = (endIndex > 0) ? endIndex : endIndex + sliceAxisDim;
if ((sliceAxisDim < realEndIndex) || (realEndIndex < realBeginIndex) || (realBeginIndex < 0))
RuntimeError("Slice operation: Index range [%d,%d), interpreted as [%d,%d), is invalid for input ([%S]).",
beginIndex,
endIndex,
realBeginIndex,
realEndIndex,
inputs[0].Shape().AsString().c_str());
auto outputTensorShape = AsTensorShape(inputs[0].Shape(), true);
// propagate as much as we can
if ((axis.StaticAxisIndex() < outputTensorShape.GetRank()) && (0 <= realBeginIndex) && (realBeginIndex <= realEndIndex) && (realEndIndex <= sliceAxisDim))
outputTensorShape.NarrowTo(axis.StaticAxisIndex(), realBeginIndex, realEndIndex);
outputs.push_back(Variable(AsNDShape(outputTensorShape), outputDataType, owner, outputDynamicAxes));
break;
}
default: default:
LogicError("Specified op %s not yet supported", PrimitiveOpTypeName(op)); LogicError("Specified op %s not yet supported", PrimitiveOpTypeName(op));
break; break;
@ -417,6 +569,17 @@ namespace CNTK
std::unordered_map<StreamInfo, std::pair<NDArrayViewPtr, NDArrayViewPtr>>& computedMeanAndInvStdDevs, std::unordered_map<StreamInfo, std::pair<NDArrayViewPtr, NDArrayViewPtr>>& computedMeanAndInvStdDevs,
const DeviceDescriptor& device /*= DeviceDescriptor::CPUDevice()*/); const DeviceDescriptor& device /*= DeviceDescriptor::CPUDevice()*/);
public:
static std::wstring s_internalDefaultDynamicAxisName;
static std::wstring s_internalNoSequenceAxisName;
static Axis NextAutoGeneratedDynamicAxis()
{
static std::atomic<unsigned int> nextAutoGeneratedDynamicAxis(0);
static const std::wstring autoGeneratedDynamicAxisNamePrefix = L"autoGeneratedDynamicAxis_";
return Axis(autoGeneratedDynamicAxisNamePrefix + std::to_wstring(nextAutoGeneratedDynamicAxis++));
}
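// Sketch: successive calls mint unique axes, e.g. the first two calls yield
// Axis(L"autoGeneratedDynamicAxis_0") and Axis(L"autoGeneratedDynamicAxis_1"); the
// dynamic-axis Slice overload relies on this to name the reduced sequence axis.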
public: public:
static CompositeFunctionPtr Create(const FunctionPtr& rootFunction, const std::wstring& name = L"") static CompositeFunctionPtr Create(const FunctionPtr& rootFunction, const std::wstring& name = L"")
{ {
@ -524,4 +687,17 @@ namespace CNTK
// the next 'Backward' call. // the next 'Backward' call.
std::unordered_set<Variable> m_currentBackpropRoots; std::unordered_set<Variable> m_currentBackpropRoots;
}; };
inline std::vector<CNTK::Axis> DynamicAxesFromInternalDynamicAxisName(const std::wstring& internalDynamicAxisName)
{
std::vector<CNTK::Axis> inputVarDynamicAxes;
if (internalDynamicAxisName == CNTK::CompositeFunction::s_internalDefaultDynamicAxisName)
inputVarDynamicAxes = { CNTK::Axis::DefaultDynamicAxis(), CNTK::Axis::DefaultBatchAxis() };
else if (internalDynamicAxisName == CNTK::CompositeFunction::s_internalNoSequenceAxisName)
inputVarDynamicAxes = { CNTK::Axis::DefaultBatchAxis() };
else
inputVarDynamicAxes = { CNTK::Axis(internalDynamicAxisName), CNTK::Axis::DefaultBatchAxis() };
return inputVarDynamicAxes;
}
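// Examples of the mapping above (the first two names are the internal constants):
//   s_internalDefaultDynamicAxisName (L"") -> { DefaultDynamicAxis(), DefaultBatchAxis() }
//   s_internalNoSequenceAxisName           -> { DefaultBatchAxis() }
//   any other name, e.g. L"myAxis"         -> { Axis(L"myAxis"), DefaultBatchAxis() }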
} }


@ -8,19 +8,18 @@
#include "Utils.h" #include "Utils.h"
#define UPDATE_FUNCTION \ #define UPDATE_FUNCTION \
switch (smoothedGradientValue->GetDataType()) \ switch (smoothedGradientValue->GetDataType()) \
{ \ { \
case DataType::Float: \ case DataType::Float: \
Update<float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \ Update<float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \
break; \ break; \
case DataType::Double: \ case DataType::Double: \
Update<double>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \ Update<double>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \
break; \ break; \
default: \ default: \
NOT_IMPLEMENTED; \ NOT_IMPLEMENTED; \
} }
using namespace Microsoft::MSR::CNTK; using namespace Microsoft::MSR::CNTK;
using namespace std; using namespace std;
@ -141,7 +140,7 @@ namespace CNTK
// L1 regularizer with proximal gradient descent method // L1 regularizer with proximal gradient descent method
if (m_additionalOptions.l1RegularizationWeight > 0) if (m_additionalOptions.l1RegularizationWeight > 0)
{ {
auto learningRate = ElementType(ParameterDependentLearningRate(parameter)); auto learningRate = ElementType(m_learningRates[m_sampleCount]);
// multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
auto weight = ElementType(learningRate * m_additionalOptions.l1RegularizationWeight * actualMBSize); auto weight = ElementType(learningRate * m_additionalOptions.l1RegularizationWeight * actualMBSize);
parameterValue->GetWritableMatrix<ElementType>()->InplaceSoftThreshold(weight); parameterValue->GetWritableMatrix<ElementType>()->InplaceSoftThreshold(weight);
@ -154,48 +153,49 @@ namespace CNTK
return arrayView->GetWritableTensorView<ElementType>(); return arrayView->GetWritableTensorView<ElementType>();
} }
LearnerBase::LearnerBase(const unordered_set<Parameter>& parameters) LearnerBase::LearnerBase(const unordered_set<Parameter>& parameters,
const LearningRatesPerSample& learningRates,
bool allocateSmoothGradients /* = true */)
: Learner(parameters), : Learner(parameters),
m_learningRatePerSample(0.0), m_learningRates(learningRates),
m_sampleCount(0) m_sampleCount(0),
m_minibatchCount(0)
{ {
const unordered_set<Parameter>& parameterSet = parameters; for (const auto& parameter : parameters)
for (const auto& parameter : parameterSet)
{ {
// TODO: using the same device to allocate data for all smoothed gradients. Is this correct? if (!allocateSmoothGradients)
// Should the device be specified on the per-parameter basis?
NDArrayViewPtr view;
if (parameter.GetDataType() == DataType::Float)
{ {
view = MakeSharedObject<NDArrayView>(0.0f, parameter.Shape(), parameter.Value()->Device()); continue;
} }
else
{ NDArrayViewPtr view = AllocateNDArrayView(parameter, parameter.Shape());
view = MakeSharedObject<NDArrayView>(0.0, parameter.Shape(), parameter.Value()->Device());
}
m_smoothedGradientValues.insert(make_pair(parameter, view)); m_smoothedGradientValues.insert(make_pair(parameter, view));
m_additionalOptions.learningRateMultipliers.insert(make_pair(parameter, 1.0));
} }
} }
void LearnerBase::ResetSmoothedGradients() /*static*/ NDArrayViewPtr LearnerBase::AllocateNDArrayView(const Parameter& parameter, const NDShape& shape)
{ {
for (const auto& parameter : Parameters()) if (parameter.GetDataType() == DataType::Float)
{ {
-            const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
-            const auto& data = smoothedGradientValue;
-            switch (data->GetDataType())
-            {
-            case DataType::Float:
-                data->SetValue(0.0f);
-                break;
-            case DataType::Double:
-                data->SetValue(0.0);
-                break;
-            default:
-                LogicError("Unsupported DataType %s", ::CNTK::DataTypeName(data->GetDataType()));
-            }
+            return MakeSharedObject<NDArrayView>(float(0.0), shape, parameter.Value()->Device());
+        }
+        else
+        {
+            return MakeSharedObject<NDArrayView>(0.0, shape, parameter.Value()->Device());
+        }
+    }
+
+    /*static*/ NDShape LearnerBase::GetMatrixShape(const Parameter& parameter)
+    {
+        if (parameter.GetDataType() == DataType::Float)
+        {
+            auto matrix = GetMatrix<float>(parameter.Value());
+            return { matrix->GetNumRows(), matrix->GetNumCols() };
+        }
+        else
+        {
+            auto matrix = GetMatrix<double>(parameter.Value());
+            return { matrix->GetNumRows(), matrix->GetNumCols() };
+        }
     }
 }
@@ -219,17 +219,19 @@ namespace CNTK
 #endif
 #if DUMPOUTPUT
+            auto learningRate = ElementType(m_learningRates[m_sampleCount]);
+            auto momentum = ElementType(MomentumPerMB(m_momentums[m_sampleCount], trainingSampleCount));
             LOGPRINTF(stderr, "learnRatePerSample=%0.8f, momentum=%0.8f, actualMBSize=%ld\n",
-                      m_learningRatePerSample, m_momentumPerSample, trainingSampleCount);
+                      learningRate, momentum, trainingSampleCount);
             LOGPRINTF(stderr, "GradUpdateType()=%s, GradientUpdateNoiseStd()=%0.8f\n",
-                      LearnerType().c_str(), m_GaussianNoiseInjectStd);
+                      LearnerType().c_str(), m_additionalOptions.gaussianNoiseInjectionStdDev);
             Print(gradientValue, "Gradient Update");
             Print(smoothedGradientValue, "Smoothed Gradient Input");
 #endif
             UPDATE_FUNCTION;
 #if DUMPOUTPUT
-            Print(parameterValue, "Parameter Update");
+            Print(parameter.Value(), "Parameter Update");
 #endif
 #ifdef _DEBUG
@@ -239,6 +241,7 @@ namespace CNTK
 #endif
         }
         m_sampleCount += trainingSampleCount;
+        m_minibatchCount++;
         return false;
     }
@@ -265,9 +268,16 @@ namespace CNTK
     /*virtual*/ Dictionary LearnerBase::GetCheckpointState() const /*override*/
     {
-        NOT_IMPLEMENTED; // Until the new checkpointing is fully fleshed out, nobody should be calling this.
         Dictionary checkpoint;
+        checkpoint[L"checkpointVersion"] = checkpointVersion;
+        checkpoint[L"sampleCount"] = m_sampleCount;
+        checkpoint[L"minibatchCount"] = m_minibatchCount;
+
+        // TODO: should we also save learning rate schedule into the checkpoint?
+        // If that is the case, need to be able to override this method in subclasses
+        // and save momentum schedule as well.
         for (const auto& parameter : Parameters())
         {
             // TODO: parameter name is not guaranteed to be unique. Instead, all serializable objects
@@ -277,31 +287,48 @@ namespace CNTK
             {
                 LogicError("Parameter names must be unique");
             }
-            const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
-            // Potentially, could store things like dimensions, element size, format, etc., but
-            // that seems to be redundant, since all of that is passed in the constructor.
-            checkpoint[parameter.Name()] = SerializeToVector(smoothedGradientValue);
+            const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
+            checkpoint[parameter.Name()] = *smoothedGradientValue;
         }
         return checkpoint;
     }

     /*virtual*/ void LearnerBase::RestoreFromCheckpoint(const Dictionary& checkpoint) /*override*/
     {
-        NOT_IMPLEMENTED; // Until the new checkpointing is fully fleshed out, nobody should be calling this.
+        m_sampleCount = checkpoint[L"sampleCount"].GetValue<size_t>();
+        m_minibatchCount = checkpoint[L"minibatchCount"].GetValue<size_t>();
+
+        size_t version = checkpoint[L"checkpointVersion"].GetValue<size_t>();
+        if (checkpointVersion != version)
+        {
+            // At the moment, we only support one version, so this should never happen.
+            LogicError("Unsupported checkpoint version.");
+        }
+
         for (const auto& parameter : Parameters())
         {
             if (!checkpoint.Contains(parameter.Name()))
             {
                 LogicError("Checkpoint does not contain state for parameter %ls", parameter.Name().c_str());
             }
             const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
-            const DictionaryValue& state = checkpoint[parameter.Name()];
-            const auto& data = smoothedGradientValue;
-            DeserializeFromVector(data, state.GetValue<vector<DictionaryValue>>());
+            const NDArrayView& checkpointedValue = checkpoint[parameter.Name()].GetValue<NDArrayView>();
+
+            if (smoothedGradientValue->GetDataType() != checkpointedValue.GetDataType())
+            {
+                LogicError("A value restored from a checkpoint for the smoothed gradient data type for parameter %ls does not match the expected value",
+                           parameter.Name().c_str());
+            }
+
+            if (smoothedGradientValue->Shape() != checkpointedValue.Shape())
+            {
+                LogicError("A value restored from a checkpoint for the smoothed gradient shape for parameter %ls does not match the expected value",
+                           parameter.Name().c_str());
+            }
+
+            smoothedGradientValue->CopyFrom(checkpointedValue);
         }
     }
@@ -313,23 +340,25 @@ namespace CNTK
     template <typename ElementType>
     void LearnerSGD::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
     {
-        UNUSED(trainingSampleCount);
-
         const auto& parameterValue = parameter.Value();
         const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
         const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
         const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);
-        const auto& learningRate = ElementType(ParameterDependentLearningRate(parameter));
+        auto learningRate = ElementType(m_learningRates[m_sampleCount]);
+        auto momentum = ElementType(MomentumPerMB(m_momentums[m_sampleCount], trainingSampleCount));

         // TODO: break up the NormalGrad into 3 different functions, each with its own set of parameters
         // (one for vanilla SGD, the other for momentum SGD, and the third one for NAG).
         smoothedGradientMatrix->NormalGrad(*gradientMatrix, *parameterMatrix,
-                                           learningRate, ElementType(m_momentumPerSample), m_useNesterovAcceleration);
+                                           learningRate, momentum, m_useNesterovAcceleration);
     }

-    LearnerAdaGrad::LearnerAdaGrad(const unordered_set<Parameter>& parameters, bool needAveMultiplier)
-        : LearnerBase(parameters), m_needAveMultiplier(needAveMultiplier)
+    LearnerAdaGrad::LearnerAdaGrad(const unordered_set<Parameter>& parameters,
+                                   const LearningRatesPerSample& learningRates,
+                                   bool needAveMultiplier)
+        : LearnerBase(parameters, learningRates),
+        m_needAveMultiplier(needAveMultiplier)
     {
     }
@@ -348,15 +377,23 @@ namespace CNTK
         const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
         const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);

-        auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
+        auto learningRate = ElementType(m_learningRates[m_sampleCount]);

         auto aveMultiplier = smoothedGradientMatrix->Adagrad(*gradientMatrix, m_needAveMultiplier);
         Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
     }

-    LearnerFSAdaGrad::LearnerFSAdaGrad(const unordered_set<Parameter>& parameters)
-        : LearnerMomentumSGD(parameters)
+    LearnerFSAdaGrad::LearnerFSAdaGrad(const unordered_set<Parameter>& parameters,
+                                       const LearningRatesPerSample& learningRates,
+                                       const MomentumsPerSample& momentums)
+        : LearnerMomentumSGD(parameters, learningRates, momentums, /*allocateSmoothGradients*/ false)
     {
+        for (const auto& parameter : parameters)
+        {
+            auto shape = GetMatrixShape(parameter);
+            NDArrayViewPtr view = AllocateNDArrayView(parameter, {shape[0], 2 * shape[1]});
+            m_smoothedGradientValues.insert(make_pair(parameter, view));
+        }
     }

     /*virtual*/ void LearnerFSAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
@@ -373,21 +410,33 @@ namespace CNTK
         const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
         const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
         const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);

-        //const double momentum = MomentumPerMB(m_momentumPerSample, trainingSampleCount);
-        auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
-        smoothedGradientMatrix->FSAdagrad(trainingSampleCount, *gradientMatrix, *parameterMatrix,
-                                          learningRate, ElementType(m_momentumPerSample));
+        auto learningRate = ElementType(m_learningRates[m_sampleCount]);
+        auto momentum = ElementType(MomentumPerMB(m_momentums[m_sampleCount], trainingSampleCount));
+
+        smoothedGradientMatrix->FSAdagrad(trainingSampleCount, *gradientMatrix, *parameterMatrix, learningRate, momentum);
     }

-    LearnerRMSProp::LearnerRMSProp(const unordered_set<Parameter>& parameters,
+    LearnerRMSProp::LearnerRMSProp(const unordered_set<Parameter>& parameters, const LearningRatesPerSample& learningRates,
                                    double gamma, double inc, double dec, double max, double min, bool needAveMultiplier)
-        : LearnerBase(parameters),
+        : LearnerBase(parameters, learningRates, /*allocateSmoothGradients*/ false),
         m_gamma(gamma), m_inc(inc), m_dec(dec), m_max(max), m_min(min),
         m_needAveMultiplier(needAveMultiplier)
     {
+        for (const auto& parameter : parameters)
+        {
+            // When needAveMultiplier == true, CPU and GPU implementations of RMSProp require different number of columns.
+            // TODO: verify that this is correct.
+            size_t factor = 3;
+            if (needAveMultiplier && parameter.Value()->Device().Type() == DeviceKind::GPU)
+            {
+                factor = 4;
+            }
+
+            auto shape = GetMatrixShape(parameter);
+            NDArrayViewPtr view = AllocateNDArrayView(parameter, {shape[0], factor * shape[1]});
+            m_smoothedGradientValues.insert(make_pair(parameter, view));
+        }
     }

     /*virtual*/ void LearnerRMSProp::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
@@ -405,12 +454,12 @@ namespace CNTK
         const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
         const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);

-        auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
+        auto learningRate = ElementType(m_learningRates[m_sampleCount]);

         auto aveMultiplier = smoothedGradientMatrix->RmsProp(*gradientMatrix,
                                                              ElementType(m_gamma), ElementType(m_inc),
                                                              ElementType(m_max), ElementType(m_dec),
                                                              ElementType(m_min), m_needAveMultiplier);
         Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
     }
@@ -418,34 +467,35 @@ namespace CNTK
     template shared_ptr<Matrix<float>> LearnerBase::GetWritableMatrix<float>(const NDArrayViewPtr& arrayView);
     template shared_ptr<Matrix<double>> LearnerBase::GetWritableMatrix<double>(const NDArrayViewPtr& arrayView);

-    LearnerPtr SGDLearner(const unordered_set<Parameter>& parameters, double learningRatePerSample)
+    LearnerPtr SGDLearner(const unordered_set<Parameter>& parameters, const LearningRatesPerSample& learningRates)
     {
-        return MakeSharedObject<LearnerSGD>(parameters, learningRatePerSample);
+        return MakeSharedObject<LearnerSGD>(parameters, learningRates);
     }

-    LearnerPtr MomentumSGDLearner(const unordered_set<Parameter>& parameters)
+    LearnerPtr MomentumSGDLearner(const unordered_set<Parameter>& parameters, const LearningRatesPerSample& learningRates, const MomentumsPerSample& momentums)
     {
-        return MakeSharedObject<LearnerMomentumSGD>(parameters);
+        return MakeSharedObject<LearnerMomentumSGD>(parameters, learningRates, momentums);
     }

-    LearnerPtr NesterovLearner(const unordered_set<Parameter>& parameters)
+    LearnerPtr NesterovLearner(const unordered_set<Parameter>& parameters, const LearningRatesPerSample& learningRates, const MomentumsPerSample& momentums)
     {
-        return MakeSharedObject<LearnerNesterov>(parameters);
+        return MakeSharedObject<LearnerNesterov>(parameters, learningRates, momentums);
     }

-    LearnerPtr AdaGradLearner(const unordered_set<Parameter>& parameters, bool needAveMultiplier)
+    LearnerPtr AdaGradLearner(const unordered_set<Parameter>& parameters, const LearningRatesPerSample& learningRates, bool needAveMultiplier)
     {
-        return MakeSharedObject<LearnerAdaGrad>(parameters, needAveMultiplier);
+        return MakeSharedObject<LearnerAdaGrad>(parameters, learningRates, needAveMultiplier);
     }

-    LearnerPtr FSAdaGradLearner(const unordered_set<Parameter>& parameters)
+    LearnerPtr FSAdaGradLearner(const unordered_set<Parameter>& parameters, const LearningRatesPerSample& learningRates, const MomentumsPerSample& momentums)
     {
-        return MakeSharedObject<LearnerFSAdaGrad>(parameters);
+        return MakeSharedObject<LearnerFSAdaGrad>(parameters, learningRates, momentums);
     }

-    LearnerPtr RMSPropLearner(const unordered_set<Parameter>& parameters,
-                              double gamma, double inc, double dec, double max, double min, bool needAveMultiplier)
+    LearnerPtr RMSPropLearner(const unordered_set<Parameter>& parameters, const LearningRatesPerSample& learningRates,
+                              double gamma, double inc, double dec, double max, double min,
+                              bool needAveMultiplier)
     {
-        return MakeSharedObject<LearnerRMSProp>(parameters, gamma, inc, dec, max, min, needAveMultiplier);
+        return MakeSharedObject<LearnerRMSProp>(parameters, learningRates, gamma, inc, dec, max, min, needAveMultiplier);
     }
 }

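As context for the schedule lookups above (m_learningRates[m_sampleCount], m_momentums[m_sampleCount]): the lookup resolves the hyperparameter in effect at the current cumulative sample count via upper_bound (the actual TrainingParameterSchedule::operator[] appears later in this commit, under Utils.cpp). A standalone sketch of the same semantics with a plain std::map (illustrative names, not the CNTK types):

    #include <cassert>
    #include <cstdio>
    #include <map>

    // Keys are cumulative sample-count thresholds: the value at key K applies to
    // samples [previous key, K); the last entry applies to everything beyond it.
    static double ScheduleLookup(const std::map<size_t, double>& schedule, size_t sampleCount)
    {
        assert(!schedule.empty());
        auto it = schedule.upper_bound(sampleCount); // first threshold strictly greater
        if (it == schedule.end())
            --it;                                    // past the last threshold: reuse the final value
        return it->second;
    }

    int main()
    {
        std::map<size_t, double> learningRates{ { 1000, 0.05 }, { 5000, 0.01 }, { 10000, 0.001 } };
        std::printf("%f\n", ScheduleLookup(learningRates, 0));      // 0.05
        std::printf("%f\n", ScheduleLookup(learningRates, 4999));   // 0.01
        std::printf("%f\n", ScheduleLookup(learningRates, 999999)); // 0.001
        return 0;
    }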
View file

@@ -9,6 +9,7 @@
 namespace CNTK
 {
+    // TODO: Move this to Trainer along with Pre-, PostProcess and ClipGradient.
     // A collection of additional options that are applicable for all standard learners
     // (after these options are set, they retain their value for the entire lifespan of a learner).
     struct AdditionalLearningOptions
@@ -18,7 +19,6 @@ namespace CNTK
         double gaussianNoiseInjectionStdDev = 0.0;
         bool gradientClippingWithTruncation = true;
         double gradientClippingThresholdPerSample = std::numeric_limits<double>::infinity();
-        std::unordered_map<Parameter, double> learningRateMultipliers;
     };

     // An abstract base class at the root of the standard learners hierarchy
@@ -33,32 +33,16 @@ namespace CNTK
         virtual void RestoreFromCheckpoint(const Dictionary& checkpoint) override final;

-        void SetAdditionalOptions(const AdditionalLearningOptions& additionalOptions)
-        {
-            m_additionalOptions = additionalOptions;
-        }
-
-        // TODO: should this be called ResetMomentum?
-        // needed for BlockMomemtumSGD to reset SGD momentum after aggregation.
-        void ResetSmoothedGradients();
-
-        // TODO: move learning rate and momentum scheduling and adjustment functionality
-        // inside the learner and drop these setters.
-        void SetLearningRate(double value) { m_learningRatePerSample = value; }
-
     protected:
-        LearnerBase(const std::unordered_set<Parameter>& parameters);
+        LearnerBase(const std::unordered_set<Parameter>& parameters,
+                    const LearningRatesPerSample& learningRates,
+                    bool allocateSmoothGradients = true);

         virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const = 0;

-        double ParameterDependentLearningRate(const Parameter& parameter) const
-        {
-            return m_learningRatePerSample * m_additionalOptions.learningRateMultipliers.at(parameter);
-        }
-
         std::string LearnerType() const;

-        double m_learningRatePerSample;
+        LearningRatesPerSample m_learningRates;

         AdditionalLearningOptions m_additionalOptions;
@@ -91,6 +75,16 @@ namespace CNTK
         template <typename ElementType>
         void PostProcess(const Parameter& parameter, const NDArrayViewPtr& gradientValue, size_t actualMBSize) const;

+        // Returns an NDArrayView with the required shape, with the same data type as parameter value
+        // and allocated on the same device.
+        static NDArrayViewPtr AllocateNDArrayView(const Parameter& parameter, const NDShape& shape);
+
+        // Retrieves the shape of the matrix corresponding to the parameter value.
+        static NDShape GetMatrixShape(const Parameter& parameter);
+
+        size_t m_sampleCount;
+        size_t m_minibatchCount;
+
     private:
         // Templatized update function, it invokes preprocess and postprocess using the provided
         // template parameter and also invokes virtual Update method implemented in one of the subclasses.
@@ -101,18 +95,20 @@ namespace CNTK
         static bool HasNan(const NDArrayViewPtr& value, const char* name);
         static void Print(const NDArrayViewPtr& value, const char* msg);

-        size_t m_sampleCount;
+        static const size_t checkpointVersion = 1;
     };

     // Vanilla gradient descent optimization algorithm.
     class LearnerSGD : public LearnerBase
     {
     public:
-        LearnerSGD(const std::unordered_set<Parameter>& parameters, double learningRatePerSample = 0)
-            : LearnerBase(parameters), m_momentumPerSample(0.0), m_useNesterovAcceleration(false)
-        {
-            SetLearningRate(learningRatePerSample);
-        }
+        LearnerSGD(const std::unordered_set<Parameter>& parameters,
+                   const LearningRatesPerSample& learningRates,
+                   bool allocateSmoothGradients = true)
+            : LearnerBase(parameters, learningRates, allocateSmoothGradients),
+            m_momentums(0.0),
+            m_useNesterovAcceleration(false)
+        { }

     protected:
@@ -121,7 +117,8 @@ namespace CNTK
         template <typename ElementType>
         void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;

-        double m_momentumPerSample;
+        // TODO: Move m_momentums to LearnerMomentumSGD as soon as NormalGrad is refactored.
+        MomentumsPerSample m_momentums;
         bool m_useNesterovAcceleration;
     };
@@ -129,20 +126,25 @@ namespace CNTK
     class LearnerMomentumSGD : public LearnerSGD
     {
     public:
-        LearnerMomentumSGD(const std::unordered_set<Parameter>& parameters)
-            : LearnerSGD(parameters)
-        {}
-
-        void SetMomentum(double value) { m_momentumPerSample = value; }
+        LearnerMomentumSGD(const std::unordered_set<Parameter>& parameters,
+                           const LearningRatesPerSample& learningRates,
+                           const MomentumsPerSample& momentums,
+                           bool allocateSmoothGradients = true)
+            : LearnerSGD(parameters, learningRates, allocateSmoothGradients)
+        {
+            m_momentums = momentums;
+        }
     };

     // Nesterov's accelerated gradient descent.
-    class LearnerNesterov : public LearnerSGD
+    class LearnerNesterov : public LearnerMomentumSGD
     {
     public:
-        LearnerNesterov(const std::unordered_set<Parameter>& parameters)
-            : LearnerSGD(parameters)
+        LearnerNesterov(const std::unordered_set<Parameter>& parameters,
+                        const LearningRatesPerSample& learningRates,
+                        const MomentumsPerSample& momentums)
+            : LearnerMomentumSGD(parameters, learningRates, momentums)
         {
             m_useNesterovAcceleration = true;
         }
@@ -152,7 +154,9 @@ namespace CNTK
     {
     public:
-        LearnerAdaGrad(const std::unordered_set<Parameter>& parameters, bool needAveMultiplier);
+        LearnerAdaGrad(const std::unordered_set<Parameter>& parameters,
+                       const LearningRatesPerSample& learningRates,
+                       bool needAveMultiplier);

     protected:
         bool m_needAveMultiplier;
@@ -167,7 +171,9 @@ namespace CNTK
     {
     public:
-        LearnerFSAdaGrad(const std::unordered_set<Parameter>& parameters);
+        LearnerFSAdaGrad(const std::unordered_set<Parameter>& parameters,
+                         const LearningRatesPerSample& learningRates,
+                         const MomentumsPerSample& momentums);

     protected:
@@ -182,7 +188,9 @@ namespace CNTK
     public:
         LearnerRMSProp(const std::unordered_set<Parameter>& parameters,
-                       double gamma, double inc, double dec, double max, double min, bool needAveMultiplier);
+                       const LearningRatesPerSample& learningRates,
+                       double gamma, double inc, double dec, double max, double min,
+                       bool needAveMultiplier);

     protected:

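The reshaped hierarchy above chains constructors: LearnerNesterov now derives from LearnerMomentumSGD, which forwards to LearnerSGD, and subclasses that lay out custom smoothed-gradient buffers (FSAdaGrad, RMSProp) pass allocateSmoothGradients = false up the chain. A minimal standalone sketch of that pattern (stand-in types, not the CNTK classes):

    #include <iostream>

    struct SGD
    {
        SGD(double rate, bool allocateSmoothGradients = true)
            : m_rate(rate)
        {
            if (allocateSmoothGradients)
                std::cout << "allocating default smoothed gradients\n";
        }
        double m_rate;
        double m_momentum = 0.0;
        bool m_useNesterov = false;
    };

    struct MomentumSGD : SGD
    {
        MomentumSGD(double rate, double momentum, bool allocateSmoothGradients = true)
            : SGD(rate, allocateSmoothGradients)
        {
            m_momentum = momentum;
        }
    };

    struct Nesterov : MomentumSGD
    {
        Nesterov(double rate, double momentum)
            : MomentumSGD(rate, momentum)
        {
            m_useNesterov = true; // same update path as momentum SGD, NAG flag on
        }
    };

    int main()
    {
        Nesterov learner(0.01, 0.9);
        std::cout << learner.m_rate << " " << learner.m_momentum << " " << learner.m_useNesterov << "\n";
        return 0;
    }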
View file

@@ -49,10 +49,12 @@ namespace CNTK
         m_streamInfos.insert({ streamDesc->m_name, streamDesc->m_id, AsStorageFormat(streamDesc->m_storageType), AsDataType(streamDesc->m_elementType), AsNDShape(*(streamDesc->m_sampleLayout)) });
     }

-    /*virtual*/ std::unordered_map<StreamInfo, MinibatchData> CompositeMinibatchSource::GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
-                                                                                                         const DeviceDescriptor& device /*= DeviceDescriptor::DefaultDevice()*/) /*override*/
+    /*virtual*/ const std::unordered_map<StreamInfo, MinibatchData>&
+    CompositeMinibatchSource::GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
+                                               const DeviceDescriptor& device /*= DeviceDescriptor::DefaultDevice()*/) /*override*/
     {
-        std::unordered_map<StreamInfo, MinibatchData> minibatchData;
+        m_minibatchData.clear();
+
         if (!m_epochEndReached)
         {
             // TODO: Support different minibatch sizes for different streams
@@ -117,7 +119,9 @@ namespace CNTK
                 auto currentStreamMinibatchData = compositeReaderMinibatchData.m_data[i];
                 if (currentStreamDesc->m_elementType == ElementType::tfloat)
                 {
-                    auto dataMatrix = std::make_shared<Matrix<float>>(CPUDEVICE);
+                    auto CNTKMatrixType = (currentStreamDesc->m_storageType == StorageType::dense) ? DENSE : SPARSE;
+                    auto CNTKMatrixFormat = (currentStreamDesc->m_storageType == StorageType::dense) ? matrixFormatDense : matrixFormatSparseCSC;
+                    auto dataMatrix = std::make_shared<Matrix<float>>(0, 0, CPUDEVICE, CNTKMatrixType, CNTKMatrixFormat);
                     size_t sampleSize = currentStreamDesc->m_sampleLayout->GetNumElements();

                     // TODO: Eliminate the unnecessary CPU to CPU copy
@@ -127,14 +131,14 @@ namespace CNTK
                     size_t numSamples = currentStreamMinibatchData->m_layout->GetActualNumSamples();
                     size_t numSequences = currentStreamMinibatchData->m_layout->GetNumSequences();

-                    minibatchData[currentStreamInfo] = { numSequences, numSamples, minibatchValuePtr };
+                    m_minibatchData[currentStreamInfo] = { numSequences, numSamples, minibatchValuePtr };
                 }
                 else
                     LogicError("Input data of type other than DataType::Float is currently unsupported by the CNTK built-in composite MinibatchSource!");
             }
         }

-        return minibatchData;
+        return m_minibatchData;
     }

     void ComputeInputPerDimMeansAndInvStdDevs(const MinibatchSourcePtr& minibatchSource,

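GetNextMinibatch now hands back a const reference to the member map m_minibatchData instead of a by-value copy; the trade-off is that the reference stays valid only until the next call, which clears and refills the map. A standalone sketch of the pattern (illustrative types):

    #include <iostream>
    #include <string>
    #include <unordered_map>

    class Source
    {
    public:
        const std::unordered_map<std::string, int>& Next()
        {
            m_data.clear();          // invalidates contents handed out previously
            m_data["samples"] = ++m_call;
            return m_data;           // no per-call copy of the whole map
        }

    private:
        std::unordered_map<std::string, int> m_data;
        int m_call = 0;
    };

    int main()
    {
        Source src;
        const auto& first = src.Next();
        std::cout << first.at("samples") << "\n"; // 1
        src.Next();                               // refreshes the shared map
        std::cout << first.at("samples") << "\n"; // 2: "first" aliases the same map
        return 0;
    }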
View file

@@ -19,8 +19,8 @@ namespace CNTK
         virtual const std::unordered_set<StreamInfo>& StreamInfos() override { return m_streamInfos; }

-        virtual std::unordered_map<StreamInfo, MinibatchData> GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
-                                                                               const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice()) override;
+        virtual const std::unordered_map<StreamInfo, MinibatchData>& GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
+                                                                                      const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice()) override;

     private:
         std::unordered_set<StreamInfo> m_streamInfos;
@@ -28,5 +28,6 @@ namespace CNTK
         bool m_epochEndReached;
         size_t m_prevMinibatchSize;
         size_t m_epochSize;
+        std::unordered_map<StreamInfo, MinibatchData> m_minibatchData;
     };
 }

View file

@@ -61,11 +61,12 @@ namespace CNTK
             LogicError("The gradient value for a Parameter cannot have an associated mask!");
         }

-        auto trainingLossArguments = m_trainingLossVar.Owner()->Arguments();
-        auto labelsVar = *(std::find_if(trainingLossArguments.begin(), trainingLossArguments.end(), [](const Variable& var) {
-            return var.IsInput();
-        }));
-        auto argumentValue = arguments.at(labelsVar);
+        auto trainingLossArgument = *(m_trainingLossVar.Owner()->Arguments().begin());
+
+        // Find the argument whose dynamic axes match the criterion operation's dynamic axes (i.e. label dynamic axes)
+        auto argumentValue = std::find_if(arguments.begin(), arguments.end(), [trainingLossArgument](const std::pair<Variable, ValuePtr>& currentPair) {
+            return (currentPair.first.DynamicAxes() == trainingLossArgument.DynamicAxes());
+        })->second;
         auto argumentData = argumentValue->Data();
         auto argumentDataShape = argumentData->Shape();
         auto mask = argumentValue->Mask();

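The new lookup above selects the label argument by comparing dynamic axes rather than assuming a particular input variable. A standalone sketch of that find-by-attribute pattern over a map (illustrative types, not the CNTK Variable/Value API):

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <unordered_map>

    struct Var { std::string axes; };
    struct VarHash { size_t operator()(const Var& v) const { return std::hash<std::string>()(v.axes); } };
    struct VarEq { bool operator()(const Var& a, const Var& b) const { return a.axes == b.axes; } };

    int main()
    {
        Var criterion{ "batch+sequence" };
        std::unordered_map<Var, int, VarHash, VarEq> arguments{ { { "batch" }, 10 }, { { "batch+sequence" }, 42 } };

        // Scan for the entry whose axis tag matches the criterion's axis tag.
        auto it = std::find_if(arguments.begin(), arguments.end(),
                               [&](const std::pair<const Var, int>& kv) { return kv.first.axes == criterion.axes; });
        if (it != arguments.end())
            std::cout << it->second << "\n"; // 42
        return 0;
    }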
View file

@@ -6,20 +6,100 @@
 #include "stdafx.h"
 #include "CNTKLibrary.h"
 #include "Utils.h"
-#include "File.h"
+#include <istream>
+#include <ostream>

 using namespace std;

 namespace CNTK
 {
+    // This wrapper redefines operator<< in terms of unformatted (binary) write operation.
+    struct BinaryOStreamWrapper
+    {
+        BinaryOStreamWrapper(ostream& s) : m_stream(s) {}
+
+        template<typename T>
+        typename std::enable_if<std::is_pod<T>::value, BinaryOStreamWrapper&>::type
+        operator<<(const T& value)
+        {
+            m_stream.write(reinterpret_cast<const char*>(&value), sizeof(T));
+            return *this;
+        }
+
+        BinaryOStreamWrapper& operator<<(const wstring& str)
+        {
+            *this << str.length();
+            m_stream.write(reinterpret_cast<const char*>(str.c_str()), str.length() * sizeof(wchar_t));
+            return *this;
+        }
+
+        operator ostream& () { return m_stream; }
+
+        ostream& m_stream;
+        BinaryOStreamWrapper(const BinaryOStreamWrapper&) = delete; BinaryOStreamWrapper(BinaryOStreamWrapper&&) = delete; BinaryOStreamWrapper& operator=(const BinaryOStreamWrapper&) = delete; BinaryOStreamWrapper& operator=(BinaryOStreamWrapper&&) = delete;
+    };
+
+    // This wrapper redefines operator>> in terms of unformatted (binary) read operation.
+    struct BinaryIStreamWrapper
+    {
+        BinaryIStreamWrapper(istream& s) : m_stream(s) {}
+
+        template<typename T>
+        typename std::enable_if<std::is_pod<T>::value, BinaryIStreamWrapper&>::type
+        operator>>(T& value)
+        {
+            static_assert(sizeof(T) <= sizeof(size_t), "size_t is the largest supported type.");
+            m_stream.read(buf, sizeof(T));
+            value = *(reinterpret_cast<T*>(buf));
+            return *this;
+        }
+
+        BinaryIStreamWrapper& operator>>(wstring& str)
+        {
+            size_t length;
+            *this >> length;
+            str.resize(length);
+            for (size_t i = 0; i < length; ++i)
+            {
+                m_stream.read(buf, sizeof(wchar_t));
+                str[i] = *(reinterpret_cast<wchar_t*>(buf));
+            }
+            return *this;
+        }
+
+        operator istream& () const { return m_stream; }
+
+        istream& m_stream;
+        char buf[sizeof(size_t)];
+        BinaryIStreamWrapper(const BinaryIStreamWrapper&) = delete; BinaryIStreamWrapper(BinaryIStreamWrapper&&) = delete; BinaryIStreamWrapper& operator=(const BinaryIStreamWrapper&) = delete; BinaryIStreamWrapper& operator=(BinaryIStreamWrapper&&) = delete;
+    };
+
+    template <typename T>
+    T* CreateDataPtr(const T& value)
+    {
+        return new T(value);
+    }
+
+    template <>
+    NDArrayView* CreateDataPtr<NDArrayView>(const NDArrayView& value)
+    {
+        // TODO: replace this copy with an alias to value.
+        NDArrayView* viewPtr = new NDArrayView(value.GetDataType(), value.Shape(), DeviceDescriptor::CPUDevice());
+        viewPtr->CopyFrom(value);
+        return viewPtr;
+    }
+
     template <typename T>
     void DictionaryValue::AllocateDataPtr(const T& value)
     {
         static_assert(is_same<T, NDShape>::value ||
                       is_same<T, wstring>::value ||
                       is_same<T, vector<DictionaryValue>>::value ||
-                      is_same<T, Dictionary>::value, "AllocateDataPtr called with invalid type");
-        m_data.m_ptr = new T(value);
+                      is_same<T, Dictionary>::value ||
+                      is_same<T, NDArrayView>::value,
+                      "AllocateDataPtr called with invalid type");
+        m_data.m_ptr = CreateDataPtr<T>(value);
     }

     template <typename T>
@@ -31,12 +111,163 @@ namespace CNTK
         m_data.m_ptr = nullptr;
     }

-    Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, DictionaryValue& us)
+    template <typename ElementType>
+    bool AreEqual(NDArrayView& view1, NDArrayView& view2)
     {
+        if (view1.GetDataType() != view2.GetDataType() ||
+            view1.Shape() != view2.Shape())
+        {
+            return false;
+        }
+
+        ElementType* data1 = nullptr;
+        ElementType* data2 = nullptr;
+        if (view1.Device().Type() == DeviceKind::CPU)
+        {
+            data1 = view1.WritableDataBuffer<ElementType>();
+            data2 = view2.WritableDataBuffer<ElementType>();
+        }
+        else
+        {
+            NDArrayViewPtr temp1CpuDataView = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), view1.Shape(), DeviceDescriptor::CPUDevice());
+            temp1CpuDataView->CopyFrom(view1);
+            data1 = temp1CpuDataView->WritableDataBuffer<ElementType>();
+
+            NDArrayViewPtr temp2CpuDataView = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), view2.Shape(), DeviceDescriptor::CPUDevice());
+            temp2CpuDataView->CopyFrom(view2);
+            data2 = temp2CpuDataView->WritableDataBuffer<ElementType>();
+        }
+
+        size_t numElements = view1.Shape().TotalSize();
+        for (size_t i = 0; i < numElements; ++i)
+        {
+            if (data1[i] != data2[i])
+            {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    bool DictionaryValue::operator==(const DictionaryValue& other) const
+    {
+        if (this == &other)
+        {
+            return true;
+        }
+
+        if (m_valueType != other.m_valueType)
+        {
+            return false;
+        }
+
+        switch (m_valueType)
+        {
+        case DictionaryValue::Type::Bool:
+            return (m_data.m_boolean == other.m_data.m_boolean);
+        case DictionaryValue::Type::SizeT:
+            return (m_data.m_sizeT == other.m_data.m_sizeT);
+        case DictionaryValue::Type::Float:
+            return (m_data.m_float == other.m_data.m_float);
+        case DictionaryValue::Type::Double:
+            return (m_data.m_double == other.m_data.m_double);
+        case DictionaryValue::Type::String:
+        {
+            wstring* strPtr1 = reinterpret_cast<wstring*>(m_data.m_ptr);
+            wstring* strPtr2 = reinterpret_cast<wstring*>(other.m_data.m_ptr);
+            return (*strPtr1 == *strPtr2);
+        }
+        case DictionaryValue::Type::NDShape:
+        {
+            NDShape* shapePtr1 = reinterpret_cast<NDShape*>(m_data.m_ptr);
+            NDShape* shapePtr2 = reinterpret_cast<NDShape*>(other.m_data.m_ptr);
+            return (*shapePtr1 == *shapePtr2);
+        }
+        case DictionaryValue::Type::Vector:
+        {
+            vector<DictionaryValue>* vectorPtr1 = reinterpret_cast<vector<DictionaryValue>*>(m_data.m_ptr);
+            vector<DictionaryValue>* vectorPtr2 = reinterpret_cast<vector<DictionaryValue>*>(other.m_data.m_ptr);
+            return (*vectorPtr1 == *vectorPtr2);
+        }
+        case DictionaryValue::Type::Dictionary:
+        {
+            Dictionary* dictPtr1 = reinterpret_cast<Dictionary*>(m_data.m_ptr);
+            Dictionary* dictPtr2 = reinterpret_cast<Dictionary*>(other.m_data.m_ptr);
+            return (*dictPtr1 == *dictPtr2);
+        }
+        case DictionaryValue::Type::NDArrayView:
+        {
+            NDArrayView* viewPtr1 = reinterpret_cast<NDArrayView*>(m_data.m_ptr);
+            NDArrayView* viewPtr2 = reinterpret_cast<NDArrayView*>(other.m_data.m_ptr);
+
+            switch (viewPtr1->GetDataType())
+            {
+            case DataType::Float:
+                return AreEqual<float>(*viewPtr1, *viewPtr2);
+            case DataType::Double:
+                return AreEqual<double>(*viewPtr1, *viewPtr2);
+            default:
+                NOT_IMPLEMENTED;
+            }
+        }
+        default:
+            NOT_IMPLEMENTED;
+        }
+    }
+
+    bool DictionaryValue::operator!=(const DictionaryValue& other) const
+    {
+        return !(*this == other);
+    }
+
+    BinaryOStreamWrapper& operator<<(BinaryOStreamWrapper& stream, const NDShape& us)
+    {
+        auto size = us.NumAxes();
+        stream << size;
+        for (auto i = 0; i < size; i++)
+        {
+            stream << us[i];
+        }
+        return stream;
+    }
+
+    template <typename T>
+    void Write(BinaryOStreamWrapper& stream, const NDArrayView& view)
+    {
+        assert(view.Device().Type() == DeviceKind::CPU);
+
+        auto numElements = view.Shape().TotalSize();
+        const T* buffer = view.DataBuffer<T>();
+        for (auto i = 0; i < numElements; ++i)
+        {
+            stream << buffer[i];
+        }
+    }
+
+    template <typename T>
+    void Read(BinaryIStreamWrapper& stream, NDArrayView& view)
+    {
+        assert(view.Device().Type() == DeviceKind::CPU);
+
+        auto numElements = view.Shape().TotalSize();
+        T* buffer = view.WritableDataBuffer<T>();
+        for (auto i = 0; i < numElements; ++i)
+        {
+            stream >> buffer[i];
+        }
+    }
+
+    istream& operator>>(istream& stdStream, DictionaryValue& us)
+    {
+        BinaryIStreamWrapper stream(stdStream);
         size_t version;
         stream >> version;

-        stream >> us.m_valueType;
+        unsigned int type;
+        stream >> type;
+        us.m_valueType = static_cast<DictionaryValue::Type>(type);

         switch (us.ValueType())
         {
@@ -52,28 +283,72 @@ namespace CNTK
         case DictionaryValue::Type::Double:
             stream >> us.m_data.m_double;
             break;
+        case DictionaryValue::Type::String:
+        {
+            wstring* strPtr = new wstring();
+            stream >> *strPtr;
+            us.m_data.m_ptr = strPtr;
+            break;
+        }
         case DictionaryValue::Type::NDShape:
         {
             size_t size;
             stream >> size;
-            vector<size_t> dims(size);
+            NDShape* shapePtr = new NDShape(size);
             for (auto i = 0; i < size; i++)
             {
-                stream >> dims[i];
+                stream >> shapePtr->operator[](i);
             }
-            us.AllocateDataPtr(NDShape(dims));
+            us.m_data.m_ptr = shapePtr;
             break;
         }
         case DictionaryValue::Type::Vector:
         {
             size_t size;
             stream >> size;
-            vector<DictionaryValue> values(size);
+            vector<DictionaryValue>* vectorPtr = new vector<DictionaryValue>(size);
             for (auto i = 0; i < size; i++)
             {
-                stream >> values[i];
+                stream >> vectorPtr->at(i);
             }
-            us.AllocateDataPtr(values);
+            us.m_data.m_ptr = vectorPtr;
+            break;
+        }
+        case DictionaryValue::Type::Dictionary:
+        {
+            Dictionary* dictPtr = new Dictionary();
+            stream >> *dictPtr;
+            us.m_data.m_ptr = dictPtr;
+            break;
+        }
+        case DictionaryValue::Type::NDArrayView:
+        {
+            unsigned int type;
+            stream >> type;
+            DataType dtype = static_cast<DataType>(type);
+
+            size_t size;
+            stream >> size;
+            NDShape shape(size);
+            for (auto i = 0; i < size; i++)
+            {
+                stream >> shape[i];
+            }
+
+            NDArrayView* viewPtr = new NDArrayView(dtype, shape, DeviceDescriptor::CPUDevice());
+            switch (dtype)
+            {
+            case DataType::Float:
+                Read<float>(stream, *viewPtr);
+                break;
+            case DataType::Double:
+                Read<double>(stream, *viewPtr);
+                break;
+            default:
+                LogicError("Unsupported DataType %s", DataTypeName(dtype));
+            }
+
+            us.m_data.m_ptr = viewPtr;
             break;
         }
         default:
@@ -82,11 +357,13 @@ namespace CNTK
         return stream;
     }

-    Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const DictionaryValue& us)
+    ostream& operator<<(ostream& stdStream, const DictionaryValue& us)
     {
+        BinaryOStreamWrapper stream(stdStream);
+
         stream << us.version;

-        stream << us.ValueType();
+        stream << static_cast<unsigned int>(us.ValueType());

         switch (us.ValueType())
         {
@@ -102,15 +379,16 @@ namespace CNTK
         case DictionaryValue::Type::Double:
             stream << us.m_data.m_double;
             break;
+        case DictionaryValue::Type::String:
+        {
+            wstring* stringPtr = reinterpret_cast<wstring*>(us.m_data.m_ptr);
+            stream << *stringPtr;
+            break;
+        }
         case DictionaryValue::Type::NDShape:
         {
             NDShape* shapePtr = reinterpret_cast<NDShape*>(us.m_data.m_ptr);
-            auto size = shapePtr->NumAxes();
-            stream << size;
-            for (auto i = 0; i < size; i++)
-            {
-                stream << shapePtr->operator[](i);
-            }
+            stream << *shapePtr;
             break;
         }
         case DictionaryValue::Type::Vector:
@@ -121,7 +399,31 @@ namespace CNTK
             stream << size;
             for (auto i = 0; i < size; i++)
             {
-                stream << vectorPtr->operator[](i);
+                stream << vectorPtr->at(i);
             }
             break;
         }
+        case DictionaryValue::Type::Dictionary:
+        {
+            Dictionary* dictPtr = reinterpret_cast<Dictionary*>(us.m_data.m_ptr);
+            stream << *dictPtr;
+            break;
+        }
+        case DictionaryValue::Type::NDArrayView:
+        {
+            NDArrayView* viewPtr = reinterpret_cast<NDArrayView*>(us.m_data.m_ptr);
+            stream << static_cast<unsigned int>(viewPtr->GetDataType());
+            stream << viewPtr->Shape();
+
+            switch (viewPtr->GetDataType())
+            {
+            case DataType::Float:
+                Write<float>(stream, *viewPtr);
+                break;
+            case DataType::Double:
+                Write<double>(stream, *viewPtr);
+                break;
+            default:
+                LogicError("Unsupported DataType %s", DataTypeName(viewPtr->GetDataType()));
+            }
+            break;
+        }
@@ -148,7 +450,7 @@ namespace CNTK
     Dictionary& Dictionary::operator=(const Dictionary& other)
     {
         assert(this != &other);
-        m_dictionaryData.reset(new std::unordered_map<std::wstring, DictionaryValue>(*(other.m_dictionaryData)));
+        m_dictionaryData.reset(new unordered_map<wstring, DictionaryValue>(*(other.m_dictionaryData)));
         return *this;
     }
@@ -183,20 +485,51 @@ namespace CNTK
         return (m_dictionaryData->find(key) != m_dictionaryData->end());
     }

-    Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const Dictionary& us)
+    bool Dictionary::operator==(const Dictionary& other) const
     {
+        if (this == &other)
+        {
+            return true;
+        }
+
+        if (m_dictionaryData->size() != other.m_dictionaryData->size())
+        {
+            return false;
+        }
+
+        for (auto& kv : *m_dictionaryData)
+        {
+            auto result = other.m_dictionaryData->find(kv.first);
+            if (result == other.m_dictionaryData->end() || kv.second != result->second)
+            {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    bool Dictionary::operator!=(const Dictionary& other) const
+    {
+        return !(*this == other);
+    }
+
+    ostream& operator<<(ostream& stdStream, const Dictionary& us)
+    {
+        BinaryOStreamWrapper stream(stdStream);
+
         stream << us.version;
         stream << us.m_dictionaryData->size();
-        for (auto it = us.m_dictionaryData->begin(); it != us.m_dictionaryData->end(); ++it)
+        for (auto& kv : *(us.m_dictionaryData))
         {
-            stream << it->first;
-            stream << it->second;
+            stream << kv.first;
+            stream << kv.second;
         }
         return stream;
     }

-    Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, Dictionary& us)
+    istream& operator>>(istream& stdStream, Dictionary& us)
     {
+        BinaryIStreamWrapper stream(stdStream);
         size_t version;
         stream >> version;
         size_t size;
@@ -206,113 +539,36 @@ namespace CNTK
         {
             wstring key;
             stream >> key;
-            DictionaryValue value;
-            stream >> value;
-            us.m_dictionaryData->insert(make_pair(key, value));
+            stream >> us[key];
         }
         return stream;
     }

+    // Returns the element whose key is greater than the required sample count
+    // or the last element if no such key exists.
     template <typename T>
-    vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr)
+    const T& TrainingParameterSchedule<T>::operator[](size_t sampleCount) const
     {
-        if (viewPtr->IsSparse())
-        {
-            LogicError("Sparse NDArrayView cannot be serialized into a vector.");
-        }
-
-        auto numElements = viewPtr->Shape().TotalSize();
-        vector<DictionaryValue> values(numElements);
-
-        NDArrayViewPtr cpuDataViewPtr = viewPtr;
-        if ((viewPtr->Device().Type() != DeviceKind::CPU))
-        {
-            cpuDataViewPtr = MakeSharedObject<NDArrayView>(viewPtr->GetDataType(), viewPtr->Shape(), DeviceDescriptor::CPUDevice());
-            cpuDataViewPtr->CopyFrom(*viewPtr);
-        }
-
-        const T* buffer = cpuDataViewPtr->DataBuffer<T>();
-        for (auto i = 0; i < numElements; ++i)
-        {
-            T v = buffer[i];
-            values[i] = DictionaryValue(v);
-        }
-
-        return values;
-    }
-
-    template <typename T>
-    void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const vector<DictionaryValue>& values)
-    {
-        if (viewPtr->IsSparse())
-        {
-            LogicError("Sparse NDArrayView cannot be deserialized from a vector.");
-        }
-
-        auto numElements = viewPtr->Shape().TotalSize();
-        if (values.size() != numElements)
-        {
-            LogicError("Number of elements (%lu) in the deserialized representation does not match the expected value (%lu)",
-                       values.size(), numElements);
-        }
-
-        NDArrayViewPtr cpuDataViewPtr = viewPtr;
-        if ((viewPtr->Device().Type() != DeviceKind::CPU))
-        {
-            cpuDataViewPtr = MakeSharedObject<NDArrayView>(viewPtr->GetDataType(), viewPtr->Shape(), DeviceDescriptor::CPUDevice());
-        }
-
-        T* buffer = cpuDataViewPtr->WritableDataBuffer<T>();
-        for (auto i = 0; i < numElements; ++i)
-        {
-            buffer[i] = values[i].GetValue<T>();
-        }
-
-        if ((viewPtr->Device().Type() != DeviceKind::CPU))
-        {
-            viewPtr->CopyFrom(*cpuDataViewPtr);
-        }
-    }
-
-    // TODO: we store the type info for every element in the vector, which is extremely redundant.
-    // Instead, it'd be nice to introduce some sort of DictionaryValueVector.
-    vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr)
-    {
-        switch (viewPtr->GetDataType())
-        {
-        case DataType::Float:
-            return SerializeToVector<float>(viewPtr);
-        case DataType::Double:
-            return SerializeToVector<double>(viewPtr);
-        default:
-            LogicError("Unsupported DataType %s", DataTypeName(viewPtr->GetDataType()));
-        }
-    }
-
-    void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const vector<DictionaryValue>& values)
-    {
-        switch (viewPtr->GetDataType())
-        {
-        case DataType::Float:
-            DeserializeFromVector<float>(viewPtr, values);
-            break;
-        case DataType::Double:
-            DeserializeFromVector<double>(viewPtr, values);
-            break;
-        default:
-            LogicError("Unsupported DataType %s", DataTypeName(viewPtr->GetDataType()));
-        }
-    }
+        assert(m_schedule.size() > 0);
+        auto it = m_schedule.upper_bound(sampleCount);
+        if (it == m_schedule.end())
+        {
+            --it;
+        }
+        return it->second;
+    }

     template void DictionaryValue::AllocateDataPtr<NDShape>(const NDShape& value);
     template void DictionaryValue::AllocateDataPtr<vector<DictionaryValue>>(const vector<DictionaryValue>& value);
     template void DictionaryValue::AllocateDataPtr<wstring>(const wstring& value);
     template void DictionaryValue::AllocateDataPtr<Dictionary>(const Dictionary& value);
+    template void DictionaryValue::AllocateDataPtr<NDArrayView>(const NDArrayView& value);

     template void DictionaryValue::FreePtrAsType<NDShape>();
     template void DictionaryValue::FreePtrAsType<vector<DictionaryValue>>();
     template void DictionaryValue::FreePtrAsType<wstring>();
     template void DictionaryValue::FreePtrAsType<Dictionary>();
+    template void DictionaryValue::FreePtrAsType<NDArrayView>();
+
+    template const double& TrainingParameterSchedule<double>::operator[](size_t key) const;
 }

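The two wrappers above reduce Dictionary serialization to length-prefixed unformatted reads and writes on standard streams. A simplified standalone round-trip sketch of the same idea (narrow strings instead of wstring; not the CNTK types):

    #include <cassert>
    #include <iostream>
    #include <sstream>
    #include <string>

    struct BinOut
    {
        explicit BinOut(std::ostream& s) : m_stream(s) {}
        BinOut& operator<<(size_t v) { m_stream.write(reinterpret_cast<const char*>(&v), sizeof(v)); return *this; }
        BinOut& operator<<(const std::string& s)
        {
            *this << s.length();                 // length prefix, then raw bytes
            m_stream.write(s.data(), s.length());
            return *this;
        }
        std::ostream& m_stream;
    };

    struct BinIn
    {
        explicit BinIn(std::istream& s) : m_stream(s) {}
        BinIn& operator>>(size_t& v) { m_stream.read(reinterpret_cast<char*>(&v), sizeof(v)); return *this; }
        BinIn& operator>>(std::string& s)
        {
            size_t length = 0;
            *this >> length;
            s.resize(length);
            m_stream.read(&s[0], length);
            return *this;
        }
        std::istream& m_stream;
    };

    int main()
    {
        std::stringstream buffer;
        BinOut out(buffer);
        out << std::string("sampleCount") << size_t(12345);

        BinIn in(buffer);
        std::string key;
        size_t value = 0;
        in >> key >> value;
        assert(key == "sampleCount" && value == 12345);
        std::cout << key << " = " << value << "\n";
        return 0;
    }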
View file

@@ -167,9 +167,6 @@ namespace CNTK
         return var.IsInput() && var.IsSparse();
     }

-    std::vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr);
-
-    void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const std::vector<DictionaryValue>& values);
-
     inline void AddIndentation(std::wstringstream& s, size_t numIndentationSpaces)
     {
@@ -250,7 +247,8 @@ namespace CNTK
         static_assert(std::is_same<T, bool>::value ||
                       std::is_same<T, size_t>::value ||
                       std::is_same<T, float>::value ||
-                      std::is_same<T, double>::value, "Unsupported ValueType");
+                      std::is_same<T, double>::value ||
+                      std::is_same<T, std::wstring>::value, "Unsupported ValueType");

         std::vector<DictionaryValue> dictionaryValueVector;
         for (auto value : basicElementTypeVector)
@@ -265,7 +263,8 @@ namespace CNTK
         static_assert(std::is_same<T, bool>::value ||
                       std::is_same<T, size_t>::value ||
                       std::is_same<T, float>::value ||
-                      std::is_same<T, double>::value, "Unsupported ValueType");
+                      std::is_same<T, double>::value ||
+                      std::is_same<T, std::wstring>::value, "Unsupported ValueType");

         std::vector<T> basicElementTypeVector;
         for (auto value : dictionaryValueVector)
@@ -313,4 +312,19 @@ namespace CNTK
         return{ paddedOutputMapCount, kernelShape };
     }

+    inline CNTK::Constant ScalarConstant(CNTK::DataType dataType, float value, const CNTK::DeviceDescriptor& device = CNTK::DeviceDescriptor::CPUDevice())
+    {
+        if (dataType == CNTK::DataType::Float)
+            return CNTK::Constant({}, value, device);
+        else if (dataType == CNTK::DataType::Double)
+            return CNTK::Constant({}, (double)value, device);
+        else
+            LogicError("CNTK::ScalarConstant: Unsupported DataType %s", DataTypeName(dataType));
+    }
+
+    inline double MomentumPerMB(double momentumPerSample, size_t minibatchSize)
+    {
+        return std::pow(momentumPerSample, minibatchSize);
+    }
 }

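For a feel of MomentumPerMB above: raising a per-sample momentum to the minibatch size shrinks it quickly, so a seemingly high per-sample value decays sharply per minibatch. A quick numeric check:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        double momentumPerSample = 0.99;
        size_t minibatchSize = 256;
        // 0.99^256 is roughly 0.0763.
        std::printf("%f\n", std::pow(momentumPerSample, (double)minibatchSize));
        return 0;
    }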
View file

@@ -7,6 +7,8 @@
 namespace CNTK
 {
+    /*static*/ const std::vector<Axis> Variable::s_defaultInputVariableDynamicAxes = { Axis::DefaultDynamicAxis(), Axis::DefaultBatchAxis() };
+
     Variable::Variable(const FunctionPtr& function)
         : Variable(function->Output())
     {

View file

@@ -11,6 +11,13 @@
 #define __UNIX__
 #endif

+#ifdef _MSC_VER
+// TODO: thread_local is supported in VS2015. Remove this macro when we upgrade to VS2015.
+#define THREAD_LOCAL __declspec(thread)
+#else
+#define THREAD_LOCAL thread_local
+#endif
+
 // ===========================================================================
 // compiler differences
 // ===========================================================================

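A minimal usage sketch for the THREAD_LOCAL macro above (the macro is duplicated here so the snippet is self-contained): each thread observes its own copy of the annotated variable.

    #include <iostream>
    #include <thread>

    #ifdef _MSC_VER
    #define THREAD_LOCAL __declspec(thread)
    #else
    #define THREAD_LOCAL thread_local
    #endif

    THREAD_LOCAL int counter = 0;

    int main()
    {
        // Each thread increments its own counter, so both print 1.
        auto bump = [] { counter++; std::cout << counter << "\n"; };
        std::thread t1(bump), t2(bump);
        t1.join();
        t2.join();
        return 0;
    }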
View file

@@ -11,6 +11,7 @@
 #include <stdio.h>
 #include <vector>
 #include <algorithm>
+#include <random>

 namespace Microsoft { namespace MSR { namespace CNTK {
@@ -24,6 +25,31 @@ static inline size_t rand(const size_t begin, const size_t end)
     return begin + randno % (end - begin);
 }

+// Rand based on Mersenne Twister.
+// We use our own distribution in order to match baselines between different operating systems,
+// because uniform_distribution is not guaranteed to provide the same numbers on different platforms.
+// TODO: Switching to Boost would eliminate this problem.
+static inline size_t RandMT(const size_t begin, const size_t end, std::mt19937_64& rng)
+{
+    const size_t randomNumber = rng();
+    return begin + randomNumber % (end - begin);
+}
+
+// Shuffle based on Mersenne Twister.
+// We use our own distribution in order to match baselines between different operating systems,
+// instead of using std::shuffle which uses uniform_distribution internally.
+// TODO: Switching to Boost would eliminate this problem.
+template <typename TVector>
+inline void RandomShuffleMT(TVector& v, std::mt19937_64& rng)
+{
+    foreach_index(currentLocation, v)
+    {
+        // Pick a random location and swap with the current one.
+        const size_t randomLocation = RandMT(0, v.size(), rng);
+        std::swap(v[currentLocation], v[randomLocation]);
+    }
+}
+
 class RandomOrdering // note: NOT thread-safe at all
 {
     // constants for randomization

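A usage sketch for RandMT/RandomShuffleMT above, re-implemented standalone because foreach_index is a CNTK-local macro. Note that the swap-with-random-location loop matches the source and trades the exact uniformity of Fisher-Yates for cross-platform reproducibility:

    #include <cstdio>
    #include <random>
    #include <vector>

    static size_t RandMT(size_t begin, size_t end, std::mt19937_64& rng)
    {
        return begin + rng() % (end - begin);
    }

    int main()
    {
        std::mt19937_64 rng(42); // fixed seed: identical shuffle on every platform
        std::vector<int> v{ 0, 1, 2, 3, 4, 5, 6, 7 };
        for (size_t i = 0; i < v.size(); ++i)
            std::swap(v[i], v[RandMT(0, v.size(), rng)]);
        for (int x : v)
            std::printf("%d ", x);
        std::printf("\n");
        return 0;
    }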
View file

@@ -258,13 +258,20 @@ public:
         m_evalOrders[rootNode] = nodes;
     }

+    bool EvalOrderExists(const ComputationNodeBasePtr& rootNode) const
+    {
+        return m_evalOrders.find(rootNode) != m_evalOrders.end();
+    }
+
     // get depth-first traversal order
     // TODO: This is currently not immutable because it gets patched w.r.t. recurrent loops. Ideally we don't patch. Need to review and verify that it is sufficient.
     const std::list<ComputationNodeBasePtr>& GetEvalOrder(const ComputationNodeBasePtr& rootNode) const
     {
         auto iter = m_evalOrders.find(rootNode);
         if (iter == m_evalOrders.end())
+        {
             LogicError("GetEvalOrder: Called without prior call to FormEvalOrder() for %ls %ls operation", rootNode->NodeName().c_str(), rootNode->OperationName().c_str());
+        }
         return iter->second;
     }

View file

@@ -76,6 +76,9 @@ void ComputationNetwork::CopySubTree(const ComputationNetwork& fromNet,
     ComputationNodeBasePtr fromRoot = fromNet.GetNodeFromName(fromName);

+    if (!fromNet.EvalOrderExists(fromRoot))
+        const_cast<ComputationNetwork&>(fromNet).FormEvalOrder(fromRoot);
+
     for (const auto& fromNode : fromNet.GetEvalOrder(fromRoot)) // BUGBUG: This probably will fail because the precomputed eval orders are invalid at this point.
     {
         wstring fromNodeName = fromNode->NodeName();
@@ -353,6 +356,9 @@ void ComputationNetwork::SetLearnableNodesBelowLearningRateMultiplier(const floa
     else
     {
         // for calculating a specific node
+        if (!EvalOrderExists(rootNode))
+            const_cast<ComputationNetwork&>(*this).FormEvalOrder(rootNode);
+
         for (const auto& node : GetAllNodesForRoot(rootNode))
         {
             if (node->OperationName() == OperationNameOf(LearnableParameter))

View file

@@ -32,15 +32,16 @@
 #define CNTK_MODEL_VERSION_1 1
 #define CNTK_MODEL_VERSION_2 2
 #define CNTK_MODEL_VERSION_3 3
 #define CNTK_MODEL_VERSION_4 4  // PastValue
 #define CNTK_MODEL_VERSION_5 5  // ND convolution and pooling
 #define CNTK_MODEL_VERSION_6 6  // batch-norm blending
 #define CNTK_MODEL_VERSION_7 7  // ElemType tag in model file
 #define CNTK_MODEL_VERSION_8 8  // DynamicAxis for inputs
 #define CNTK_MODEL_VERSION_9 9  // transpose flag in ConvolutionNode to support deconvolution
 #define CNTK_MODEL_VERSION_10 10 // learning-rate multiplier for input nodes
-#define CNTK_MODEL_VERSION_11 11 // Times() m_inputRank to support parameter-rank inference
-#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_11
+#define CNTK_MODEL_VERSION_11 11 // dynamic axis name for where nodes
+#define CNTK_MODEL_VERSION_12 12 // Times() m_inputRank to support parameter-rank inference
+#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_12

 extern bool g_shareNodeValueMatrices;

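A hedged sketch of how a version bump like CNTK_MODEL_VERSION_12 typically gates reading of newly added fields on load (illustrative only, not the actual CNTK model-loading code): fields introduced in a newer version are read only when the file's recorded version is high enough.

    #include <cstdint>
    #include <iostream>
    #include <sstream>
    #include <string>

    const uint32_t MODEL_VERSION_11 = 11; // dynamic axis name for Where nodes
    const uint32_t MODEL_VERSION_12 = 12; // Times() input rank

    int main()
    {
        std::istringstream file("12 myAxis");
        uint32_t fileVersion;
        file >> fileVersion;

        std::string dynamicAxisName = "WhereNodeAxis"; // pre-v11 default
        if (fileVersion >= MODEL_VERSION_11)
            file >> dynamicAxisName;                   // only newer files carry the field

        std::cout << dynamicAxisName << "\n"; // myAxis
        return 0;
    }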
View file

@@ -365,6 +365,7 @@ public:
         TensorShape outputShape;
         // If 2D convolution syntax is used then some of the tensor dimensions need to be inferred.
         if (m_convolution2D)
+        // NOTE: when m_convolution2D is true, it's a legacy branch. Code should not enter here any more.
         {
             // Need to update some tensors with correct input dims.
             auto inDims = ImageDimensions(GetInputSampleLayout(inputIdx), m_imageLayout);
@@ -396,6 +397,8 @@ public:
                 outputShape = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
                                                                    m_sharing, m_autoPad, m_lowerPad, m_upperPad);
+            // ConvolveGeometry always uses CHW.
+            SetDims(ImageDimensions(outputShape, ImageLayoutKind::CHW).AsTensorShape(m_imageLayout), HasMBLayout());
         }
         else
         {
@@ -414,9 +417,12 @@ public:
                 outputShape = ConvolveGeometry::ComputeInputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
                                                                   m_sharing, m_autoPad, m_lowerPad, m_upperPad);
             }
+            if (m_imageLayout == ImageLayoutKind::CHW)
+                SetDims(outputShape, HasMBLayout());
+            else // legacy format
+                SetDims(ImageDimensions(outputShape, ImageLayoutKind::CHW).AsTensorShape(m_imageLayout), HasMBLayout());
         }

-        // ConvolveGeometry always uses CHW.
-        SetDims(ImageDimensions(outputShape, ImageLayoutKind::CHW).AsTensorShape(m_imageLayout), HasMBLayout());
-
         // update LearnableParameter if it has 0 dimensions (to be inferred)
         // Typically this would be the #inputChannels (C).

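For reference, the textbook output-size arithmetic that shape computations like ConvolveGeometry::ComputeOutputShape implement per spatial dimension (generic formula, not CNTK's exact code): out = (in + lowerPad + upperPad - kernel) / stride + 1.

    #include <cstddef>
    #include <cstdio>

    static size_t ConvOutDim(size_t in, size_t kernel, size_t stride, size_t lowerPad, size_t upperPad)
    {
        return (in + lowerPad + upperPad - kernel) / stride + 1;
    }

    int main()
    {
        // 28x28 input, 5x5 kernel, stride 1, no padding -> 24x24 feature map.
        std::printf("%zu\n", ConvOutDim(28, 5, 1, 0, 0));
        // Same input with stride 2 and symmetric padding of 2 -> 14x14.
        std::printf("%zu\n", ConvOutDim(28, 5, 2, 2, 2));
        return 0;
    }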
View file

@@ -450,9 +450,9 @@ public:
             assert(dimsA.size() == m_outputRank + numReductionDims);
             while (numReductionDims < inputRank)
             {
                 dimsA.push_back(0);
                 numReductionDims++;
             }
         }

         // fill in the missing ones
@@ -561,8 +561,8 @@ class TransposeTimesNode : public TimesNodeBase<ElemType, true>
 public:
     DeclareConstructorFromConfigWithNumInputs(TransposeTimesNode);
-    TransposeTimesNode(DEVICEID_TYPE deviceId, const wstring& name)
-        : Base(deviceId, name, /*outputRank=*/1, /*inputRank=*/1)
+    TransposeTimesNode(DEVICEID_TYPE deviceId, const wstring& name, size_t outputRank = 1)
+        : Base(deviceId, name, outputRank, /*inputRank=*/1)
     {
     }
 };
@@ -665,6 +665,9 @@ public:
             m_axis1 = 1, m_axis2 = 2; // default
     }

+    int Axis1() const { return m_axis1; }
+    int Axis2() const { return m_axis2; }
+
 private:
     // compute the transposed tensor shape (in-place)
     void TransposeShape(TensorShape& shape) const

View file

@@ -300,7 +300,7 @@ template <class ElemType>
     if (!m_pMBLayout)
     {
         m_pMBLayout = make_shared<MBLayout>(); // this generates a new layout
-        m_pMBLayout->SetUniqueAxisName(L"WhereNodeAxis");
+        m_pMBLayout->SetUniqueAxisName(m_dynamicAxisName);
     }
     // we map scalars to scalars
     if (isFinalValidationPass && Input(0)->GetSampleLayout().GetNumElements() != 1)

View file

@@ -217,6 +217,9 @@ public:
     virtual bool /*ComputationNodeBase::*/ InputUsedInComputingInputNodesGradients(size_t childIndex) const override;
     virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override;

+    std::wstring ReductionOpName() const { return m_operation; }
+    int ReductionAxis() const { return m_axis; }
+
 private:
     // operation attributes
     int m_axis;
@@ -341,11 +344,12 @@ public:
         fstream << m_axis;
     }

-private:
     // these implement numpy-style negative bound values to index from the end
     size_t BeginIndex() const { return m_beginIndex >= 0 ? (size_t)m_beginIndex : (size_t)(m_beginIndex + Input(0)->GetSampleLayout()[m_axis - 1]); }
     size_t EndIndex() const { return m_endIndex > 0 ? (size_t)m_endIndex : (size_t)(m_endIndex + Input(0)->GetSampleLayout()[m_axis - 1]); }
+    int Axis() const { return m_axis; }
+
+private:
     // determine the tensor shape that represents slice of the input that we are taking
     TensorShape GetInputSlice(size_t rank, const FrameRange & fr) const
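Editor's note: BeginIndex/EndIndex fold numpy-style negative bounds into absolute positions along the slice axis, with end <= 0 counting from the back. The same arithmetic as a standalone sketch, where dim stands for Input(0)->GetSampleLayout()[m_axis - 1]:

    // For dim = 10:  begin = -3 -> 7;  end = 0 -> 10;  end = -2 -> 8
    size_t ResolveBegin(int begin, size_t dim) { return begin >= 0 ? (size_t)begin : (size_t)(begin + (int)dim); }
    size_t ResolveEnd(int end, size_t dim)     { return end   >  0 ? (size_t)end   : (size_t)(end   + (int)dim); }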
@@ -655,10 +659,11 @@ class WhereNode : public ComputationNodeNonLooping<ElemType>, public NumInputs<1
     typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembersBoilerplate;
     static const std::wstring TypeName() { return L"Where"; }
+    static const std::wstring DefaultWhereNodeDynamicAxisName() { return L"WhereNodeAxis"; }
 public:
     DeclareConstructorFromConfigWithNumInputs(WhereNode);
-    WhereNode(DEVICEID_TYPE deviceId, const wstring& name) :
-        Base(deviceId, name)
+    WhereNode(DEVICEID_TYPE deviceId, const wstring& name, const wstring& dynamicAxisName = DefaultWhereNodeDynamicAxisName()) :
+        Base(deviceId, name), m_dynamicAxisName(dynamicAxisName)
     {
         MarkValueNonSharable();
     }
@@ -669,11 +674,29 @@ public:
     virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }
     virtual void Validate(bool isFinalValidationPass) override;

+    virtual void Load(File& fstream, size_t modelVersion) override
+    {
+        Base::Load(fstream, modelVersion);
+        if (modelVersion >= CNTK_MODEL_VERSION_11)
+            fstream >> m_dynamicAxisName;
+        else
+            m_dynamicAxisName = DefaultWhereNodeDynamicAxisName();
+    }
+
+    virtual void Save(File& fstream) const override
+    {
+        Base::Save(fstream);
+        fstream << m_dynamicAxisName;
+    }
+
+    std::wstring DynamicAxisName() const { return m_dynamicAxisName; }
+
 private:
     // buffers for creating the result sequences (kept as object state to avoid memory allocations)
     std::vector<std::vector<size_t>> m_indexSequenceBuffer; // [sequenceIndex][t] for creating the result sequences
     std::vector<size_t> m_rowAllocationsBuffer; // [row] for determining new MBLayout packing
     std::vector<std::pair<size_t, size_t>> m_placementBuffer; // [sequenceIndex] assigned location for a sequence
+    std::wstring m_dynamicAxisName;
 };
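Editor's note: making the axis name a constructor argument lets callers place several Where nodes on one shared dynamic axis, while the default argument preserves the old per-node behavior. A hedged usage sketch (node names illustrative):

    auto whereA = make_shared<WhereNode<float>>(deviceId, L"whereA", L"myAxis");
    auto whereB = make_shared<WhereNode<float>>(deviceId, L"whereB", L"myAxis");
    // whereA->DynamicAxisName() == whereB->DynamicAxisName(), so validation can treat
    // their outputs as living on the same dynamic axis; omitting the argument falls
    // back to L"WhereNodeAxis" as before.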
 // -----------------------------------------------------------------------

View file

@@ -37,14 +37,8 @@
 #pragma warning(disable : 4244) // unreachable code; triggered for unknown reasons
 #pragma warning(disable : 4702) // conversion from 'double' to 'float'

-#ifdef USE_ACML
-// Download ACML 5.3.1 (e.g., acml5.3.1-ifort64.exe) or above
-// from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/
-// Install the ifort64_mp variant (compiled with intel compiler) of the library
-// Set Environment variable ACML_PATH to C:\AMD\acml5.3.1\ifort64_mp or the folder you installed acml
-// to point to your folder for the include file and link library
-#include <acml.h> // requires ACML 5.3.1 and above
-#elif defined(USE_MKL)
+#ifdef USE_MKL
 // requires MKL 10.0 and above
 #include <mkl.h>
 #else
@@ -57,12 +51,6 @@
 #include <lapacke.h>
 #endif

-#ifdef USE_ACML // MKL has one additional parameter for different matrix order
-#define BLAS_COLMAJOR
-#else
-#define BLAS_COLMAJOR (int) MatrixOrder::ColMajor,
-#endif
-
 #define SWAP(a, b) \
     {              \
         (a) ^= (b); \
@@ -912,11 +900,7 @@ void CPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, E
 #pragma omp parallel for
             foreach_column (j, us)
             {
-#ifdef USE_ACML
-                dcopy((int) numRows, reinterpret_cast<double*>(pArray + j), (int) numCols, reinterpret_cast<double*>(bufPtr + LocateColumn(j)), 1);
-#else
                 cblas_dcopy((int) numRows, reinterpret_cast<double*>(pArray + j), (int) numCols, reinterpret_cast<double*>(bufPtr + LocateColumn(j)), 1);
-#endif
             }
         }
         else
@@ -926,11 +910,7 @@ void CPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, E
             {
                 {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-                    scopy((int) numRows, reinterpret_cast<float*>(pArray + j), (int) numCols, reinterpret_cast<float*>(bufPtr + LocateColumn(j)), 1);
-#else
                     cblas_scopy((int) numRows, reinterpret_cast<float*>(pArray + j), (int) numCols, reinterpret_cast<float*>(bufPtr + LocateColumn(j)), 1);
-#endif
                 }
             }
         }
@@ -2844,20 +2824,12 @@ ElemType CPUMatrix<ElemType>::SumOfAbsElements() const
     if (sizeof(ElemType) == sizeof(double))
     {
-#ifdef USE_ACML
-        return (ElemType) dasum((int) GetNumElements(), reinterpret_cast<double*>(Data()), 1);
-#else
         return (ElemType) cblas_dasum((int) GetNumElements(), reinterpret_cast<double*>(Data()), 1);
-#endif
     }
     else
     {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-        return sasum((int) GetNumElements(), reinterpret_cast<float*>(Data()), 1);
-#else
         return cblas_sasum((int) GetNumElements(), reinterpret_cast<float*>(Data()), 1);
-#endif
     }
 }
@@ -3028,11 +3000,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
 #pragma omp parallel for
         foreach_column (j, c)
         {
-#ifdef USE_ACML
-            c(0, j) = (ElemType) dnrm2(m, reinterpret_cast<double*>(bufPtr + us.LocateColumn(j)), 1);
-#else
             c(0, j) = (ElemType) cblas_dnrm2(m, reinterpret_cast<double*>(bufPtr + us.LocateColumn(j)), 1);
-#endif
         }
     }
     else
@@ -3041,11 +3009,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
         foreach_column (j, c)
         {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-            c(0, j) = snrm2(m, reinterpret_cast<float*>(bufPtr + us.LocateColumn(j)), 1);
-#else
             c(0, j) = cblas_snrm2(m, reinterpret_cast<float*>(bufPtr + us.LocateColumn(j)), 1);
-#endif
         }
     }
 }
@@ -3058,11 +3022,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
 #pragma omp parallel for
         foreach_row (i, c)
         {
-#ifdef USE_ACML
-            c(i, 0) = dnrm2(n, reinterpret_cast<double*>(bufPtr + i), m);
-#else
             c(i, 0) = cblas_dnrm2(n, reinterpret_cast<double*>(bufPtr + i), m);
-#endif
         }
     }
     else
@@ -3071,11 +3031,7 @@ void CPUMatrix<ElemType>::VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWi
         foreach_row (i, c)
         {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-            c(i, 0) = snrm2(n, reinterpret_cast<float*>(bufPtr + i), m);
-#else
             c(i, 0) = cblas_snrm2(n, reinterpret_cast<float*>(bufPtr + i), m);
-#endif
         }
     }
 }
@@ -4486,34 +4442,22 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
     int m, n, k, l;
     int lda, ldb, ldc;
-#ifdef USE_ACML
-    char transA, transB;
-#else
     CBLAS_TRANSPOSE mklTransA;
     CBLAS_TRANSPOSE mklTransB;
-#endif

     if (transposeA)
     {
         m = (int) a.GetNumCols();
         k = (int) a.GetNumRows();
         lda = k;
-#ifdef USE_ACML
-        transA = (char) MatrixTranspose::Trans;
-#else
         mklTransA = CBLAS_TRANSPOSE::CblasTrans;
-#endif
     }
     else
     {
         m = (int) a.GetNumRows();
         k = (int) a.GetNumCols();
         lda = m;
-#ifdef USE_ACML
-        transA = (char) MatrixTranspose::NoTrans;
-#else
         mklTransA = CBLAS_TRANSPOSE::CblasNoTrans;
-#endif
     }

     if (transposeB)
@@ -4521,22 +4465,14 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
         l = (int) b.GetNumCols();
         n = (int) b.GetNumRows();
         ldb = n;
-#ifdef USE_ACML
-        transB = (char) MatrixTranspose::Trans;
-#else
         mklTransB = CBLAS_TRANSPOSE::CblasTrans;
-#endif
     }
     else
     {
         l = (int) b.GetNumRows();
         n = (int) b.GetNumCols();
         ldb = l;
-#ifdef USE_ACML
-        transB = (char) MatrixTranspose::NoTrans;
-#else
         mklTransB = CBLAS_TRANSPOSE::CblasNoTrans;
-#endif
     }

     assert(m > 0 && k > 0 && l > 0 && n > 0); // converting from size_t to int may cause overflow
@@ -4553,20 +4489,12 @@ void CPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix
     if (sizeof(ElemType) == sizeof(double))
     {
-#ifdef USE_ACML
-        dgemm(transA, transB, m, n, k, alpha, reinterpret_cast<double*>(a.Data()), lda, reinterpret_cast<double*>(b.Data()), ldb, beta, reinterpret_cast<double*>(c.Data()), ldc);
-#else
-        cblas_dgemm((CBLAS_ORDER) BLAS_COLMAJOR mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<double*>(a.Data()), lda, reinterpret_cast<double*>(b.Data()), ldb, beta, reinterpret_cast<double*>(c.Data()), ldc);
-#endif
+        cblas_dgemm((CBLAS_ORDER) (int)MatrixOrder::ColMajor, mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<double*>(a.Data()), lda, reinterpret_cast<double*>(b.Data()), ldb, beta, reinterpret_cast<double*>(c.Data()), ldc);
     }
     else
     {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-        sgemm(BLAS_COLMAJOR transA, transB, m, n, k, alpha, reinterpret_cast<float*>(a.Data()), lda, reinterpret_cast<float*>(b.Data()), ldb, beta, reinterpret_cast<float*>(c.Data()), ldc);
-#else
-        cblas_sgemm((CBLAS_ORDER) BLAS_COLMAJOR mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<float*>(a.Data()), lda, reinterpret_cast<float*>(b.Data()), ldb, beta, reinterpret_cast<float*>(c.Data()), ldc);
-#endif
+        cblas_sgemm((CBLAS_ORDER) (int)MatrixOrder::ColMajor, mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast<float*>(a.Data()), lda, reinterpret_cast<float*>(b.Data()), ldb, beta, reinterpret_cast<float*>(c.Data()), ldc);
     }
 }
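Editor's note: with the ACML char-style transpose flags gone, every GEMM goes through the CBLAS interface, whose first argument fixes the storage order explicitly. A self-contained illustration of the call shape (MKL header assumed; OpenBLAS's <cblas.h> exposes the identical signature):

    #include <mkl.h>

    int main()
    {
        // Column-major 2x2 product: C = 1.0 * A * B + 0.0 * C
        double A[] = {1, 3, 2, 4}; // [[1 2],[3 4]] stored column-major
        double B[] = {5, 7, 6, 8}; // [[5 6],[7 8]]
        double C[4] = {0};
        cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
                    2, 2, 2, 1.0, A, 2, B, 2, 0.0, C, 2);
        // C now holds [[19 22],[43 50]] in column-major order
        return 0;
    }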
@@ -4611,9 +4539,7 @@ void CPUMatrix<ElemType>::SVD(const CPUMatrix<ElemType>& A, CPUMatrix<ElemType>&
     if (sizeof(ElemType) == sizeof(double))
     {
-#ifdef USE_ACML
-        dgesvd('A', 'A', (int) m, (int) n, reinterpret_cast<double*>(A.Data()), (int) lda, reinterpret_cast<double*>(SIGMA.Data()), reinterpret_cast<double*>(U.Data()), (int) ldu, reinterpret_cast<double*>(VT.Data()), (int) ldvt, &info);
-#elif defined(USE_MKL)
+#ifdef USE_MKL
         double wkopt;
         int lwork = -1;
         dgesvd("All", "All", &m, &n, reinterpret_cast<double*>(A.Data()), &lda, reinterpret_cast<double*>(SIGMA.Data()), reinterpret_cast<double*>(U.Data()), &ldu, reinterpret_cast<double*>(VT.Data()), &ldvt, &wkopt, &lwork, &info);
@@ -4622,16 +4548,13 @@ void CPUMatrix<ElemType>::SVD(const CPUMatrix<ElemType>& A, CPUMatrix<ElemType>&
         dgesvd("All", "All", &m, &n, reinterpret_cast<double*>(A.Data()), &lda, reinterpret_cast<double*>(SIGMA.Data()), reinterpret_cast<double*>(U.Data()), &ldu, reinterpret_cast<double*>(VT.Data()), &ldvt, reinterpret_cast<double*>(W.Data()), &lwork, &info);
 #else
         std::vector<double> superb(std::max(std::min(m, n) - 1, 1));
-        info = LAPACKE_dgesvd(BLAS_COLMAJOR 'A', 'A', (int) m, (int) n, reinterpret_cast<double*>(A.Data()), (int) lda, reinterpret_cast<double*>(SIGMA.Data()),
+        info = LAPACKE_dgesvd((int) MatrixOrder::ColMajor, 'A', 'A', (int) m, (int) n, reinterpret_cast<double*>(A.Data()), (int) lda, reinterpret_cast<double*>(SIGMA.Data()),
                               reinterpret_cast<double*>(U.Data()), (int) ldu, reinterpret_cast<double*>(VT.Data()), (int) ldvt, &superb[0]);
 #endif
     }
     else
     {
-#ifdef USE_ACML
-#pragma warning(suppress : 4244)
-        sgesvd('A', 'A', (int) m, (int) n, reinterpret_cast<float*>(A.Data()), (int) lda, reinterpret_cast<float*>(SIGMA.Data()), reinterpret_cast<float*>(U.Data()), (int) ldu, reinterpret_cast<float*>(VT.Data()), (int) ldvt, &info);
-#elif defined(USE_MKL)
+#ifdef USE_MKL
         float wkopt;
         int lwork = -1;
         sgesvd("All", "All", &m, &n, reinterpret_cast<float*>(A.Data()), &lda, reinterpret_cast<float*>(SIGMA.Data()), reinterpret_cast<float*>(U.Data()), &ldu, reinterpret_cast<float*>(VT.Data()), &ldvt, &wkopt, &lwork, &info);
@@ -4640,7 +4563,7 @@ void CPUMatrix<ElemType>::SVD(const CPUMatrix<ElemType>& A, CPUMatrix<ElemType>&
         sgesvd("All", "All", &m, &n, reinterpret_cast<float*>(A.Data()), &lda, reinterpret_cast<float*>(SIGMA.Data()), reinterpret_cast<float*>(U.Data()), &ldu, reinterpret_cast<float*>(VT.Data()), &ldvt, reinterpret_cast<float*>(W.Data()), &lwork, &info);
 #else
         std::vector<float> superb(std::max(std::min(m, n) - 1, 1));
-        info = LAPACKE_sgesvd(BLAS_COLMAJOR 'A', 'A', (int) m, (int) n, reinterpret_cast<float*>(A.Data()), (int) lda, reinterpret_cast<float*>(SIGMA.Data()),
+        info = LAPACKE_sgesvd((int) MatrixOrder::ColMajor, 'A', 'A', (int) m, (int) n, reinterpret_cast<float*>(A.Data()), (int) lda, reinterpret_cast<float*>(SIGMA.Data()),
                               reinterpret_cast<float*>(U.Data()), (int) ldu, reinterpret_cast<float*>(VT.Data()), (int) ldvt, &superb[0]);
 #endif
     }
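Editor's note: the MKL branch above uses LAPACK's two-call convention: pass lwork = -1 so the routine only reports its optimal workspace size, allocate that much, then call again for the real factorization. The protocol in isolation (Fortran-style dgesvd from MKL, as in the diff; array arguments are assumed to be sized by the caller):

    #include <mkl.h>
    #include <vector>

    void SvdSketch(double* a, int m, int n, double* s, double* u, double* vt)
    {
        int lda = m, ldu = m, ldvt = n, info = 0;
        double wkopt;
        int lwork = -1; // -1 = "only report the optimal workspace size in wkopt"
        dgesvd("All", "All", &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, &wkopt, &lwork, &info);
        lwork = (int) wkopt;
        std::vector<double> work(lwork);
        dgesvd("All", "All", &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work.data(), &lwork, &info); // real run
    }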
@@ -4837,20 +4760,12 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
         if (sizeof(ElemType) == sizeof(double))
         {
-#ifdef USE_ACML
-            daxpy(len, alpha, reinterpret_cast<double*>(a.Data()), incx, reinterpret_cast<double*>(c.Data()), incy);
-#else
             cblas_daxpy(len, alpha, reinterpret_cast<double*>(a.Data()), incx, reinterpret_cast<double*>(c.Data()), incy);
-#endif
         }
         else
         {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-            saxpy(len, alpha, reinterpret_cast<float*>(a.Data()), incx, reinterpret_cast<float*>(c.Data()), incy);
-#else
             cblas_saxpy(len, alpha, reinterpret_cast<float*>(a.Data()), incx, reinterpret_cast<float*>(c.Data()), incy);
-#endif
         }
     }
     else if (a.GetNumElements() == 1) // scalar, add to all elements
@@ -4889,11 +4804,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
 #pragma omp parallel for
             foreach_column (j, c)
             {
-#ifdef USE_ACML
-                daxpy(m, alpha, reinterpret_cast<double*>(aBufPtr), 1, reinterpret_cast<double*>(cBufPtr + c.LocateColumn(j)), 1);
-#else
                 cblas_daxpy(m, alpha, reinterpret_cast<double*>(aBufPtr), 1, reinterpret_cast<double*>(cBufPtr + c.LocateColumn(j)), 1);
-#endif
             }
         }
         else
@@ -4902,11 +4813,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
             foreach_column (j, c)
             {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-                saxpy(m, alpha, reinterpret_cast<float*>(aBufPtr), 1, reinterpret_cast<float*>(cBufPtr + c.LocateColumn(j)), 1);
-#else
                 cblas_saxpy(m, alpha, reinterpret_cast<float*>(aBufPtr), 1, reinterpret_cast<float*>(cBufPtr + c.LocateColumn(j)), 1);
-#endif
             }
         }
     }
@@ -4925,11 +4832,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
 #pragma omp parallel for
             foreach_row (i, c)
            {
-#ifdef USE_ACML
-                daxpy(n, alpha, reinterpret_cast<double*>(aBufPtr), 1, reinterpret_cast<double*>(cBufPtr + i), m);
-#else
                 cblas_daxpy(n, alpha, reinterpret_cast<double*>(aBufPtr), 1, reinterpret_cast<double*>(cBufPtr + i), m);
-#endif
             }
         }
         else
@@ -4938,11 +4841,7 @@ void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>&
             foreach_row (i, c)
             {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-                saxpy(n, alpha, reinterpret_cast<float*>(aBufPtr), 1, reinterpret_cast<float*>(cBufPtr + i), m);
-#else
                 cblas_saxpy(n, alpha, reinterpret_cast<float*>(aBufPtr), 1, reinterpret_cast<float*>(cBufPtr + i), m);
-#endif
             }
         }
     }
@@ -5163,20 +5062,12 @@ template <class ElemType>
     }
     else if (sizeof(ElemType) == sizeof(double))
     {
-#ifdef USE_ACML
-        dscal(len, alpha, reinterpret_cast<double*>(a.Data()), incx); // TODO: Use overloads.
-#else
         cblas_dscal(len, alpha, reinterpret_cast<double*>(a.Data()), incx);
-#endif
     }
     else
     {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-        sscal(len, alpha, reinterpret_cast<float*>(a.Data()), incx);
-#else
         cblas_sscal(len, alpha, reinterpret_cast<float*>(a.Data()), incx);
-#endif
     }
 }
@@ -5224,11 +5115,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
 #pragma omp parallel for
         foreach_column (j, c)
         {
-#ifdef USE_ACML
-            c(0, j) = (ElemType) ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn(j)), 1);
-#else
             c(0, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn(j)), 1);
-#endif
         }
     }
     else
@@ -5237,11 +5124,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
         foreach_column (j, c)
         {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-            c(0, j) = (ElemType) sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn(j)), 1);
-#else
             c(0, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn(j)), 1);
-#endif
         }
     }
 }
@@ -5256,11 +5139,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
 #pragma omp parallel for
         foreach_row (i, c)
         {
-#ifdef USE_ACML
-            c(i, 0) = ddot(n, reinterpret_cast<double*>(aBufPtr + i), m, reinterpret_cast<double*>(bBufPtr + i), m);
-#else
             c(i, 0) = cblas_ddot(n, reinterpret_cast<double*>(aBufPtr + i), m, reinterpret_cast<double*>(bBufPtr + i), m);
-#endif
         }
     }
     else
@@ -5269,11 +5148,7 @@ void CPUMatrix<ElemType>::InnerProduct(const CPUMatrix<ElemType>& a, const CPUMa
         foreach_row (i, c)
         {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-            c(i, 0) = sdot(n, reinterpret_cast<float*>(aBufPtr + i), m, reinterpret_cast<float*>(bBufPtr + i), m);
-#else
             c(i, 0) = cblas_sdot(n, reinterpret_cast<float*>(aBufPtr + i), m, reinterpret_cast<float*>(bBufPtr + i), m);
-#endif
         }
     }
 }
@@ -5298,20 +5173,12 @@ ElemType CPUMatrix<ElemType>::InnerProductOfMatrices(const CPUMatrix<ElemType>&
     if (sizeof(ElemType) == sizeof(double))
     {
-#ifdef USE_ACML
-        return (ElemType) ddot((int) a.GetNumElements(), reinterpret_cast<double*>(a.Data()), 1, reinterpret_cast<double*>(b.Data()), 1);
-#else
         return (ElemType) cblas_ddot((int) a.GetNumElements(), reinterpret_cast<double*>(a.Data()), 1, reinterpret_cast<double*>(b.Data()), 1);
-#endif
     }
     else
     {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-        return (ElemType) sdot((int) a.GetNumElements(), reinterpret_cast<float*>(a.Data()), 1, reinterpret_cast<float*>(b.Data()), 1);
-#else
         return (ElemType) cblas_sdot((int) a.GetNumElements(), reinterpret_cast<float*>(a.Data()), 1, reinterpret_cast<float*>(b.Data()), 1);
-#endif
     }
 }
@@ -5539,21 +5406,13 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
     {
         for (long j = 0; j < n; j++)
         {
-#ifdef USE_ACML
-            c(0, j) = (ElemType) ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn(j)), 1);
-#else
             c(0, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn(j)), 1);
-#endif
         }
         for (long j = 0; j < n; j++)
         {
             for (long i = 1; i < negnumber + 1; i++)
             {
-#ifdef USE_ACML
-                c(i, j) = (ElemType) ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn((j + shift + i - 1) % n)), 1);
-#else
                 c(i, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<double*>(bBufPtr + b.LocateColumn((j + shift + i - 1) % n)), 1);
-#endif
             }
         }
     }
@@ -5561,21 +5420,13 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
     {
         for (long j = 0; j < n; j++)
         {
-#ifdef USE_ACML
-            c(0, j) = (ElemType) sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn(j)), 1);
-#else
             c(0, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn(j)), 1);
-#endif
         }
         for (long j = 0; j < n; j++)
         {
             for (long i = 1; i < negnumber + 1; i++)
             {
-#ifdef USE_ACML
-                c(i, j) = (ElemType) sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn((j + shift + i - 1) % n)), 1);
-#else
                 c(i, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast<float*>(bBufPtr + b.LocateColumn((j + shift + i - 1) % n)), 1);
-#endif
             }
         }
     }
@@ -5593,11 +5444,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
 #pragma omp parallel for
         foreach_row (i, c)
         {
-#ifdef USE_ACML
-            c(i, 0) = (ElemType) ddot(n, reinterpret_cast<double*>(aBufPtr + i), m, reinterpret_cast<double*>(bBufPtr + i), m);
-#else
             c(i, 0) = (ElemType) cblas_ddot(n, reinterpret_cast<double*>(aBufPtr + i), m, reinterpret_cast<double*>(bBufPtr + i), m);
-#endif
         }
     }
     else
@@ -5606,11 +5453,7 @@ void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a,
         foreach_row (i, c)
         {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-            c(i, 0) = sdot(n, reinterpret_cast<float*>(aBufPtr + i), m, reinterpret_cast<float*>(bBufPtr + i), m);
-#else
             c(i, 0) = cblas_sdot(n, reinterpret_cast<float*>(aBufPtr + i), m, reinterpret_cast<float*>(bBufPtr + i), m);
-#endif
         }
     }
 }
@@ -6025,13 +5868,11 @@ int CPUMatrix<ElemType>::SetNumThreads(int numThreads)
     omp_set_num_threads(numThreads);
     numThreads = omp_get_max_threads();

-#ifdef USE_ACML
-    acmlsetnumthreads(numThreads);
-#elif defined(USE_MKL)
+#ifdef USE_MKL
     mkl_set_num_threads(numThreads);
 #elif defined(USE_OPENBLAS)
     openblas_set_num_threads(numThreads);
 #endif
 #endif
     return numThreads;
 }
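Editor's note: after the cleanup, SetNumThreads only fans the request out to OpenMP plus whichever BLAS the build selected. A hedged usage sketch (assuming the usual static declaration on CPUMatrix):

    int granted = Microsoft::MSR::CNTK::CPUMatrix<float>::SetNumThreads(4);
    fprintf(stderr, "CPU math using %d threads\n", granted);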

View file

@@ -23,15 +23,7 @@
 #pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this

-#ifdef USE_ACML
-// use ACML as default.
-// Download ACML 5.3.0 (e.g., acml5.3.0-ifort64.exe) or above
-// from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/
-// Install the ifort64 variant (compiled with intel compiler) of the library
-// Set Environment variable ACML_PATH to C:\AMD\acml5.3.0\ifort64_mp or the folder you installed acml
-// to point to your folder for the include file and link library
-#include <acml.h> // requires ACML 5.3.0 and above
-#elif defined(USE_MKL)
+#ifdef USE_MKL
 // requires MKL 10.0 and above
 #include <mkl.h>
 #else
@@ -53,12 +45,6 @@
 // return 42;
 //}

-#ifdef USE_ACML // MKL has one additional parameter for different matrix order
-#define BLAS_COLMAJOR
-#else
-#define BLAS_COLMAJOR (int) MatrixOrder::ColMajor,
-#endif
-
 // TODO: Move to CommonMatrix.h
 #define IDX2C(i, j, ld) (((j) * (ld)) + (i)) // 0 based indexing
@@ -261,11 +247,23 @@ void CPUSparseMatrix<ElemType>::SetValue(const CPUSparseMatrix<ElemType>& v)
     RequireSizeAndAllocate(v.GetNumRows(), v.GetNumCols(), v.NzSize());
     let nz = v.NzCount();

+    auto matrixFormat = v.GetFormat();
+    if (((matrixFormat == matrixFormatSparseBlockCol) || (matrixFormat == matrixFormatSparseBlockRow)) && (v.GetBlockIdShift() > 0))
+        NOT_IMPLEMENTED;
+
     if (nz > 0)
     {
         memcpy(NzValues(), v.NzValues(), v.NzSize());
-        memcpy(RowLocation(), v.RowLocation(), v.RowSize());
-        memcpy(ColLocation(), v.ColLocation(), v.ColSize());
+
+        if ((matrixFormat == matrixFormatSparseCSC) || (matrixFormat == matrixFormatSparseCSR))
+        {
+            memcpy(RowLocation(), v.RowLocation(), v.RowSize());
+            memcpy(ColLocation(), v.ColLocation(), v.ColSize());
+        }
+        else
+        {
+            memcpy(GetBlockIds(), v.GetBlockIds(), v.GetBlockSize());
+        }
     }
     if (v.m_sliceViewOffset > 0)
     {
@@ -384,6 +382,66 @@ CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::DoGatherColumnsOf(ElemType
     return *this;
 }

+// *this[:,idx[j]] = a[:,j] * alpha + *this[:,idx[j]] * beta
+template <class ElemType>
+CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::DoScatterColumnsOf(ElemType beta, const CPUMatrix<ElemType>& idx, const CPUSparseMatrix<ElemType>& a, ElemType alpha)
+{
+    VerifyWritable(__func__);
+
+    if ((a.GetFormat() != matrixFormatSparseCSC) || (GetFormat() != matrixFormatSparseCSC))
+        NOT_IMPLEMENTED;
+
+    if (idx.GetNumRows() != 1) // index is 1-dimensional only
+        InvalidArgument("DoScatterColumnsOf: Map must be a row vector.");
+
+    if (beta != 0)
+        NOT_IMPLEMENTED;
+
+    if (NzCount() != 0)
+        InvalidArgument("CPUSparseMatrix::DoScatterColumnsOf: The target matrix cannot have pre-existing non-zero values when being scattered into");
+
+    size_t numNonZeroElements = a.NzCount();
+
+    if (beta == 0)
+        RequireSizeAndAllocate(GetNumRows(), GetNumCols(), numNonZeroElements);
+
+    // Setup the Secondary index
+    std::vector<int> columnElementCounts(GetNumCols(), 0);
+    size_t numColsToWrite = idx.GetNumCols();
+    for (long j = 0; j < numColsToWrite; j++)
+    {
+        auto jOutF = idx(0, j); // this is the column we need to write to
+        if (::isnan(jOutF) || (jOutF < 0)) // negative index means gap
+            continue;
+        size_t jOut = (size_t)jOutF;
+        columnElementCounts[jOut] = a.SecondaryIndexLocation()[j + 1] - a.SecondaryIndexLocation()[j];
+    }
+
+    // TODO: Replace with std::exclusive_scan when we switch to C++17
+    for (size_t i = 1; i <= GetNumCols(); ++i)
+        SecondaryIndexLocation()[i] = SecondaryIndexLocation()[i - 1] + columnElementCounts[i - 1];
+
+    size_t offset = a.SecondaryIndexLocation()[0];
+    // TODO: Does it make sense to parallelize this?
+    for (long j = 0; j < numColsToWrite; j++)
+    {
+        auto jOutF = idx(0, j); // this is the column we need to write to
+        if (::isnan(jOutF) || (jOutF < 0)) // negative index means gap
+            continue;
+        size_t jOut = (size_t)jOutF;
+
+        auto start = SecondaryIndexLocation()[jOut];
+        auto end = SecondaryIndexLocation()[jOut + 1];
+        for (auto p = start; p < end; p++, offset++)
+        {
+            GetUnCompIndex()[p] = a.GetUnCompIndex()[offset];
+            Buffer()[p] = a.Buffer()[offset] * alpha;
+        }
+    }
+
+    return *this;
+}
+
 template <class ElemType>
 void CPUSparseMatrix<ElemType>::Print(const char* matrixName) const
 {
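Editor's note: DoScatterColumnsOf rebuilds the CSC compressed column index with an exclusive prefix sum over per-column nonzero counts; column j's values then live at [colIdx[j], colIdx[j+1]). The construction in isolation, with made-up counts:

    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<int> counts = {2, 0, 3, 1};        // nonzeros per column
        std::vector<int> colIdx(counts.size() + 1, 0); // CSC secondary index
        for (size_t j = 0; j < counts.size(); ++j)
            colIdx[j + 1] = colIdx[j] + counts[j];     // exclusive scan
        for (int v : colIdx)
            printf("%d ", v);                          // prints: 0 2 2 5 6
        return 0;
    }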
@@ -587,13 +645,7 @@ void CPUSparseMatrix<ElemType>::SetMatrixFromCSCFormat(const CPUSPARSE_INDEX_TYP
 }

 template <class ElemType>
 ElemType* CPUSparseMatrix<ElemType>::Data() const
-{
-    return Buffer() + GetCompIndex()[m_sliceViewOffset];
-}
-
-template <class ElemType>
-ElemType* CPUSparseMatrix<ElemType>::Data()
 {
     return (Buffer() +
         ((GetFormat() == matrixFormatSparseCSC || GetFormat() == matrixFormatSparseCSR) ? GetCompIndex()[m_sliceViewOffset] : 0));
@@ -1340,20 +1392,12 @@ ElemType CPUSparseMatrix<ElemType>::SumOfAbsElements() const
     if (sizeof(ElemType) == sizeof(double))
     {
-#ifdef USE_ACML
-        return (ElemType) dasum((int) this->NzCount(), reinterpret_cast<double*>(Data()), 1);
-#else
         return (ElemType) cblas_dasum((int) this->NzCount(), reinterpret_cast<double*>(Data()), 1);
-#endif
     }
     else
     {
 #pragma warning(suppress : 4244)
-#ifdef USE_ACML
-        return sasum((int) this->NzCount(), reinterpret_cast<float*>(Data()), 1);
-#else
         return cblas_sasum((int) this->NzCount(), reinterpret_cast<float*>(Data()), 1);
-#endif
     }
 }
@@ -1495,7 +1539,6 @@ template void CPUSparseMatrix<char>::SetValue(size_t, size_t, char);
 template void CPUSparseMatrix<char>::SetValue(CPUSparseMatrix<char> const&);
 //template void CPUSparseMatrix<char>::SetValue(GPUSparseMatrix<char> const&);
 template char* CPUSparseMatrix<char>::Data() const;
-template char* CPUSparseMatrix<char>::Data();
 template void CPUSparseMatrix<char>::Reset(void);
 template void CPUSparseMatrix<char>::Resize(const size_t, const size_t, const size_t, const bool);
 template void CPUSparseMatrix<char>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const bool, bool);
@@ -1518,7 +1561,6 @@ template void CPUSparseMatrix<short>::SetValue(size_t, size_t, short);
 template void CPUSparseMatrix<short>::SetValue(CPUSparseMatrix<short> const&);
 //template void CPUSparseMatrix<short>::SetValue(GPUSparseMatrix<short> const&);
 template short* CPUSparseMatrix<short>::Data() const;
-template short* CPUSparseMatrix<short>::Data();
 template void CPUSparseMatrix<short>::Reset(void);
 template void CPUSparseMatrix<short>::Resize(const size_t, const size_t, const size_t, const bool);
 template void CPUSparseMatrix<short>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const bool, bool);

View file

@@ -92,13 +92,13 @@ public:
     void MaskColumnsValue(const CPUMatrix<char>& columnsMask, ElemType val);

     CPUSparseMatrix<ElemType>& DoGatherColumnsOf(ElemType beta, const CPUMatrix<ElemType>& idx, const CPUSparseMatrix<ElemType>& a, ElemType alpha);
+    CPUSparseMatrix<ElemType>& DoScatterColumnsOf(ElemType beta, const CPUMatrix<ElemType>& idx, const CPUSparseMatrix<ElemType>& a, ElemType alpha);

     size_t BufferSize() const
     {
         return GetSizeAllocated() * sizeof(ElemType);
     }

     ElemType* Data() const;
-    ElemType* Data();

     inline size_t GetNumElemAllocated() const
     {
         return GetSizeAllocated();
@@ -262,7 +262,8 @@ public:
     CPUSPARSE_INDEX_TYPE* MajorIndexLocation() const
     {
-        return GetUnCompIndex() + GetCompIndex()[m_sliceViewOffset];
+        return (GetUnCompIndex() +
+            ((GetFormat() == matrixFormatSparseCSC || GetFormat() == matrixFormatSparseCSR) ? GetCompIndex()[m_sliceViewOffset] : 0));
     } // this is the major index, row/col ids in CSC/CSR format

     size_t MajorIndexCount() const

View file

@@ -237,7 +237,7 @@ std::pair<size_t, size_t> TracingGPUMemoryAllocator::GetFreeAndTotalMemoryInMBs(
 // deviceId - the device on which the operation will take place
 void PrepareDevice(DEVICEID_TYPE deviceId)
 {
-    static DEVICEID_TYPE currentDevice = DEVICEID_NOTYETDETERMINED;
+    THREAD_LOCAL static DEVICEID_TYPE currentDevice = DEVICEID_NOTYETDETERMINED;
     // and if we last set the device to be this device we are good
     if (deviceId == currentDevice)
         return;
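Editor's note: cudaSetDevice state is per host thread, so a process-wide static cache of the current device is wrong as soon as multiple threads drive different GPUs; THREAD_LOCAL is presumably CNTK's portability macro for thread-local storage. The idiom in plain C++11:

    #include <cuda_runtime.h>

    void PrepareDeviceSketch(int deviceId)
    {
        thread_local int currentDevice = -1; // -1 = not yet determined on this thread
        if (deviceId == currentDevice)
            return;                          // this thread already targets deviceId
        currentDevice = deviceId;
        cudaSetDevice(deviceId);             // sets per-thread CUDA runtime state
    }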

View file

@@ -227,6 +227,5 @@
   <Target Name="CheckDependencies">
     <Error Condition="'$(MathLibrary)' == 'MKL' And '$(CNTK_MKL_PATH)' == ''" Text="CNTK custom MKL location not specified, see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#optional-mkl for instructions." />
     <Error Condition="'$(MathLibrary)' == 'MKL' And !Exists('$(CNTKCustomMKLPath)')" Text="CNTK custom MKL not found. See https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#optional-mkl for instructions." />
-    <Error Condition="'$(MathLibrary)' == 'ACML' And !Exists('$(ACML_PATH)')" Text="ACML not found. See https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#acml for instructions." />
   </Target>
 </Project>

View file

@@ -1081,7 +1081,7 @@ Matrix<ElemType>& Matrix<ElemType>::DoScatterColumnsOf(ElemType beta, const Matr
     DISPATCH_MATRIX_ON_FLAG(&a, this,
                             { m_CPUMatrix->DoScatterColumnsOf(beta, *idx.m_CPUMatrix, *a.m_CPUMatrix, alpha); },
                             { m_GPUMatrix->DoScatterColumnsOf(beta, *idx.m_GPUMatrix, *a.m_GPUMatrix, alpha); },
-                            { NOT_IMPLEMENTED; },
+                            { m_CPUSparseMatrix->DoScatterColumnsOf(beta, *idx.m_CPUMatrix, *a.m_CPUSparseMatrix, alpha); },
                             { NOT_IMPLEMENTED; });

     return *this;

View file

@@ -377,8 +377,8 @@ void HTKMLFReader<ElemType>::PrepareForTrainingOrTesting(const ConfigRecordType&
         // second, remove trailing slash if there is any
         // TODO: when gcc -v is 4.9 or greater, this should be: std::regex_replace(rootpath, L"\\/+$", wstring());
-        size_t stringPos = 0;
-        for (stringPos = rootpath.length() - 1; stringPos >= 0; stringPos--)
+        int stringPos = 0;
+        for (stringPos = (int) (rootpath.length() - 1); stringPos >= 0; stringPos--)
         {
             if (rootpath[stringPos] != L'/')
             {
@@ -517,11 +517,11 @@ void HTKMLFReader<ElemType>::PrepareForTrainingOrTesting(const ConfigRecordType&
             m_lattices->setverbosity(m_verbosity);

         // now get the frame source. This has better randomization and doesn't create temp files
-        bool minimizeReaderMemoryFootprint = readerConfig(L"minimizeReaderMemoryFootprint", true);
-        m_frameSource.reset(new msra::dbn::minibatchutterancesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims,
+        bool useMersenneTwisterRand = readerConfig(L"useMersenneTwisterRand", false);
+        m_frameSource.reset(new msra::dbn::minibatchutterancesourcemulti(useMersenneTwisterRand, infilesmulti, labelsmulti, m_featDims, m_labelDims,
                                                                          numContextLeft, numContextRight, randomize,
                                                                          *m_lattices, m_latticeMap, m_frameMode,
-                                                                         minimizeReaderMemoryFootprint, m_expandToUtt));
+                                                                         m_expandToUtt));
         m_frameSource->setverbosity(m_verbosity);
     }
     else if (EqualCI(readMethod, L"rollingWindow"))

View file

@@ -12,7 +12,8 @@
 #include "latticearchive.h" // for reading HTK phoneme lattices (MMI training)
 #include "minibatchsourcehelpers.h"
 #include "minibatchiterator.h"
-#include "unordered_set"
+#include <unordered_set>
+#include <random>

 namespace msra { namespace dbn {

@@ -38,6 +39,10 @@ class minibatchutterancesourcemulti : public minibatchsource
     // const std::vector<unique_ptr<latticesource>> &lattices;
     const latticesource &lattices;

+    // Flag indicating whether to use Mersenne Twister random generator.
+    bool m_useMersenneTwister;
+    std::mt19937_64 m_rng;
+
     // std::vector<latticesource> lattices;
     // word-level transcripts (for MMI mode when adding best path to lattices)
     const map<wstring, msra::lattices::lattice::htkmlfwordsequence> &allwordtranscripts; // (used for getting word-level transcripts)
@@ -413,6 +418,7 @@ class minibatchutterancesourcemulti : public minibatchsource
         // When true we use a rolling window of randomized framerefs to minimize memory
         // footprint, instead of using a large vector listing all frames in the training corpus
         // Functionally, the 2 methods are identical.
+        // When it is true, we also use Mersenne Twister random generator for randomization.
         const bool m_minimizeMemoryFootprint;

         // [globalt-sweepts] -> (chunk, utt, frame) lookup table for randomized frames --this can be REALLY big!
@@ -429,6 +435,10 @@ class minibatchutterancesourcemulti : public minibatchsource
         size_t m_currentRangeEndChunkIdx;
         size_t m_nextFramePosNotYetRandomized;

+        // If m_minimizeMemoryFootprint is true, Mersenne Twister is used for randomization
+        // because rand has problems in distributed case.
+        std::mt19937_64 m_rng;
+
     public:
         framerandomizer(const std::vector<std::vector<chunk>>& randomizedChunks, bool minimizeMemoryFootprint)
             : m_randomizedChunks(randomizedChunks), m_minimizeMemoryFootprint(minimizeMemoryFootprint), m_currentRangeBeginChunkIdx(0), m_currentRangeEndChunkIdx(0), m_nextFramePosNotYetRandomized(0)
@@ -496,7 +506,9 @@ class minibatchutterancesourcemulti : public minibatchsource
                 for (;;) // (randomization retry loop)
                 {
-                    size_t tswap = Microsoft::MSR::CNTK::rand(postbegin, postend); // random frame position within allowed range
+                    size_t tswap = m_minimizeMemoryFootprint ?
+                        Microsoft::MSR::CNTK::RandMT(postbegin, postend, m_rng) :
+                        Microsoft::MSR::CNTK::rand(postbegin, postend); // random frame position within allowed range

                     // We want to swap 't' to 'tswap' and 'tswap' to 't'.
                     //  - Both may have been swapped before.
                     //  - Both must stay within the randomization window of their respective position.
@@ -542,11 +554,11 @@ class minibatchutterancesourcemulti : public minibatchsource
         void reset(unsigned int randSeed)
         {
-            srand(randSeed);
             size_t sweepts = m_randomizedChunks[0][0].globalts;
             size_t totalFrames = m_randomizedChunks[0].back().globalte() - sweepts;
             if (m_minimizeMemoryFootprint)
             {
+                m_rng.seed(randSeed);
                 m_randomizedframerefsWindow.clear();
                 m_currentRangeBeginChunkIdx = m_randomizedChunks[0][0].windowbegin;
                 m_currentRangeEndChunkIdx = m_currentRangeBeginChunkIdx;
@@ -554,6 +566,7 @@ class minibatchutterancesourcemulti : public minibatchsource
             }
             else
             {
+                srand(randSeed + 1);
                 if (m_randomizedframerefs.size() != totalFrames)
                     m_randomizedframerefs.resize(totalFrames);
@@ -866,10 +879,11 @@ public:
     // constructor
     // Pass empty labels to denote unsupervised training (so getbatch() will not return uids).
     // This mode requires utterances with time stamps.
-    minibatchutterancesourcemulti(const std::vector<std::vector<wstring>> &infiles, const std::vector<map<wstring, std::vector<msra::asr::htkmlfentry>>> &labels,
+    minibatchutterancesourcemulti(bool useMersenneTwister, const std::vector<std::vector<wstring>> &infiles, const std::vector<map<wstring, std::vector<msra::asr::htkmlfentry>>> &labels,
                                   std::vector<size_t> vdim, std::vector<size_t> udim, std::vector<size_t> leftcontext, std::vector<size_t> rightcontext, size_t randomizationrange,
-                                  const latticesource &lattices, const map<wstring, msra::lattices::lattice::htkmlfwordsequence> &allwordtranscripts, const bool framemode, bool minimizeMemoryFootprint, std::vector<bool> expandToUtt)
-        : vdim(vdim), leftcontext(leftcontext), rightcontext(rightcontext), sampperiod(0), featdim(0), randomizationrange(randomizationrange), currentsweep(SIZE_MAX), lattices(lattices), allwordtranscripts(allwordtranscripts), framemode(framemode), chunksinram(0), timegetbatch(0), verbosity(2), m_generatePhoneBoundaries(!lattices.empty()), m_frameRandomizer(randomizedchunks, minimizeMemoryFootprint), expandToUtt(expandToUtt)
+                                  const latticesource &lattices, const map<wstring, msra::lattices::lattice::htkmlfwordsequence> &allwordtranscripts, const bool framemode, std::vector<bool> expandToUtt)
+        : vdim(vdim), leftcontext(leftcontext), rightcontext(rightcontext), sampperiod(0), featdim(0), randomizationrange(randomizationrange), currentsweep(SIZE_MAX), lattices(lattices), allwordtranscripts(allwordtranscripts), framemode(framemode), chunksinram(0), timegetbatch(0), verbosity(2), m_generatePhoneBoundaries(!lattices.empty()), m_frameRandomizer(randomizedchunks, useMersenneTwister), expandToUtt(expandToUtt),
+          m_useMersenneTwister(useMersenneTwister)
     // [v-hansu] change framemode (lattices.empty()) into framemode (false) to run utterance mode without lattice
     // you also need to change another line, search : [v-hansu] comment out to run utterance mode without lattice
     {
@@ -1251,8 +1265,16 @@ private:
                 randomizedchunkrefs[i].push_back(allchunks[i].begin() + j);
             assert(randomizedchunkrefs[i].size() == allchunks[i].size());

-            // note that sincew randomshuffle() uses sweep as seed, this will keep the randomization common across all feature streams
-            randomshuffle(randomizedchunkrefs[i], sweep); // bring into random order (with random seed depending on sweep)
+            if (m_useMersenneTwister)
+            {
+                m_rng.seed((unsigned long)sweep);
+                Microsoft::MSR::CNTK::RandomShuffleMT(randomizedchunkrefs[i], m_rng); // bring into random order (with random seed depending on sweep)
+            }
+            else
+            {
+                // note that sincew randomshuffle() uses sweep as seed, this will keep the randomization common across all feature streams
+                randomshuffle(randomizedchunkrefs[i], sweep); // bring into random order (with random seed depending on sweep)
+            }
         }

         // place them onto the global timeline -> randomizedchunks[]
@@ -1348,7 +1370,7 @@ private:
         // check we got those setup right

         // we now randomly shuffle randomizedutterancerefs[pos], while considering the constraints of what chunk range needs to be in memory
-        srand((unsigned int) sweep + 1);
+        m_useMersenneTwister ? m_rng.seed((unsigned long)sweep) : srand((unsigned int)sweep + 1);
         for (size_t i = 0; i < randomizedutterancerefs.size(); i++)
         {
             // get valid randomization range, expressed in chunks
@@ -1364,7 +1386,9 @@ private:
             for (;;)
             {
                 // pick a random location
-                const size_t j = Microsoft::MSR::CNTK::rand(posbegin, posend); // a random number within the window
+                const size_t j = m_useMersenneTwister ?
+                    Microsoft::MSR::CNTK::RandMT(posbegin, posend, m_rng) :
+                    Microsoft::MSR::CNTK::rand(posbegin, posend); // a random number within the window
                 if (i == j)
                     break; // the random gods say "this one points to its original position"... nothing wrong about that, but better not try to swap
@@ -1416,7 +1440,7 @@ private:
         }
         else // frame mode
        {
-            m_frameRandomizer.reset((unsigned int)sweep + 1);
+            m_frameRandomizer.reset((unsigned int)sweep);
         }

         return sweep;
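Editor's note: the reason for threading std::mt19937_64 through the randomizers is determinism: rand()/srand() are implementation-defined and process-global, so distributed workers can diverge, while a Mersenne Twister seeded with the sweep index produces the same permutation everywhere. A standalone sketch of the assumed behavior of the RandMT/RandomShuffleMT helpers:

    #include <cstdint>
    #include <random>
    #include <vector>

    // Value in [begin, end) drawn from a caller-owned engine (modulo bias ignored in this sketch).
    size_t RandMTSketch(size_t begin, size_t end, std::mt19937_64& rng)
    {
        return begin + (size_t)(rng() % (end - begin));
    }

    // Deterministic Fisher-Yates shuffle: same seed -> same order on every worker.
    template <typename T>
    void ShuffleMTSketch(std::vector<T>& v, uint64_t seed)
    {
        std::mt19937_64 rng(seed); // e.g., seed = sweep index
        for (size_t i = v.size(); i > 1; --i)
            std::swap(v[i - 1], v[RandMTSketch(0, i, rng)]);
    }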

View file

@@ -21,7 +21,7 @@ public:
     ByteReader() = default;
     virtual ~ByteReader() = default;

-    virtual void Register(size_t seqId, const std::string& path) = 0;
+    virtual void Register(const std::map<std::string, size_t>& sequences) = 0;
     virtual cv::Mat Read(size_t seqId, const std::string& path, bool grayscale) = 0;

     DISABLE_COPY_AND_MOVE(ByteReader);
@@ -30,7 +30,7 @@ public:
 class FileByteReader : public ByteReader
 {
 public:
-    void Register(size_t, const std::string&) override {}
+    void Register(const std::map<std::string, size_t>&) override {}
     cv::Mat Read(size_t seqId, const std::string& path, bool grayscale) override;
 };
@@ -40,7 +40,7 @@ class ZipByteReader : public ByteReader
 public:
     ZipByteReader(const std::string& zipPath);

-    void Register(size_t seqId, const std::string& path) override;
+    void Register(const std::map<std::string, size_t>& sequences) override;
     cv::Mat Read(size_t seqId, const std::string& path, bool grayscale) override;

 private:
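Editor's note: moving Register from one (seqId, path) call per image to a single path-to-seqId map lets a reader resolve everything in one pass over its index; for the zip reader that is one scan of the archive directory instead of a per-file lookup. A hedged usage sketch:

    std::map<std::string, size_t> sequences; // path -> sequence id (values illustrative)
    sequences["images/cat_001.jpg"] = 0;
    sequences["images/dog_042.jpg"] = 1;
    reader->Register(sequences);             // one batch call instead of one per image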

View file

@@ -13,6 +13,7 @@
 #include "ImageConfigHelper.h"
 #include "StringUtil.h"
 #include "ConfigUtil.h"
+#include "TimerUtility.h"
 
 namespace Microsoft { namespace MSR { namespace CNTK {
 
@@ -135,6 +136,7 @@ ImageDataDeserializer::ImageDataDeserializer(CorpusDescriptorPtr corpus, const C
     }
 
     string precision = (ConfigValue)config("precision", "float");
+    m_verbosity = config(L"verbosity", 0);
 
     // Feature stream.
     ConfigParameters featureSection = inputs(featureNames[0]);
@@ -144,6 +146,7 @@ ImageDataDeserializer::ImageDataDeserializer(CorpusDescriptorPtr corpus, const C
     features->m_storageType = StorageType::dense;
     features->m_elementType = AreEqualIgnoreCase(precision, "float") ? ElementType::tfloat : ElementType::tdouble;
     m_streams.push_back(features);
+    m_featureElementType = features->m_elementType;
 
     // Label stream.
     ConfigParameters label = inputs(labelNames[0]);
@@ -179,6 +182,8 @@ ImageDataDeserializer::ImageDataDeserializer(const ConfigParameters& config)
     const auto& label = m_streams[configHelper.GetLabelStreamId()];
     const auto& feature = m_streams[configHelper.GetFeatureStreamId()];
 
+    m_verbosity = config(L"verbosity", 0);
+
     // Expect data in HWC.
     ImageDimensions dimensions(*feature->m_sampleLayout, configHelper.GetDataFormat());
     feature->m_sampleLayout = std::make_shared<TensorShape>(dimensions.AsTensorShape(HWC));
@@ -240,9 +245,13 @@ void ImageDataDeserializer::CreateSequenceDescriptions(CorpusDescriptorPtr corpu
     size_t curId = 0;
     std::string line;
     PathReaderMap knownReaders;
+    ReaderSequenceMap readerSequences;
     ImageSequenceDescription description;
     description.m_numberOfSamples = 1;
 
+    Timer timer;
+    timer.Start();
+
     auto& stringRegistry = corpus->GetStringRegistry();
     for (size_t lineIndex = 0; std::getline(mapFile, line); ++lineIndex)
     {
@@ -296,9 +305,20 @@ void ImageDataDeserializer::CreateSequenceDescriptions(CorpusDescriptorPtr corpu
             m_keyToSequence[description.m_key.m_sequence] = m_imageSequences.size();
             m_imageSequences.push_back(description);
-            RegisterByteReader(description.m_id, description.m_path, knownReaders);
+            RegisterByteReader(description.m_id, description.m_path, knownReaders, readerSequences);
         }
     }
 
+    for (auto& reader : knownReaders)
+    {
+        reader.second->Register(readerSequences[reader.first]);
+    }
+
+    timer.Stop();
+    if (m_verbosity > 1)
+    {
+        fprintf(stderr, "ImageDeserializer: Read information about %d images in %.6g seconds\n", (int)m_imageSequences.size(), timer.ElapsedSeconds());
+    }
 }
 
 ChunkPtr ImageDataDeserializer::GetChunk(ChunkIdType chunkId)
@@ -307,7 +327,7 @@ ChunkPtr ImageDataDeserializer::GetChunk(ChunkIdType chunkId)
     return std::make_shared<ImageChunk>(sequenceDescription, *this);
 }
 
-void ImageDataDeserializer::RegisterByteReader(size_t seqId, const std::string& path, PathReaderMap& knownReaders)
+void ImageDataDeserializer::RegisterByteReader(size_t seqId, const std::string& path, PathReaderMap& knownReaders, ReaderSequenceMap& readerSequences)
 {
     assert(!path.empty());
 
@@ -330,16 +350,19 @@ void ImageDataDeserializer::RegisterByteReader(size_t seqId, const std::string&
     {
         reader = std::make_shared<ZipByteReader>(containerPath);
         knownReaders[containerPath] = reader;
+        readerSequences[containerPath] = std::map<std::string, size_t>();
     }
     else
     {
         reader = (*r).second;
     }
-    reader->Register(seqId, itemPath);
+    readerSequences[containerPath][itemPath] = seqId;
     m_readers[seqId] = reader;
 #else
     UNUSED(seqId);
     UNUSED(knownReaders);
+    UNUSED(readerSequences);
     RuntimeError("The code is built without zip container support. Only plain image files are supported.");
 #endif
 }

View file

@@ -72,7 +72,8 @@ private:
     // Not using nocase_compare here as it's not correct on Linux.
     using PathReaderMap = std::unordered_map<std::string, std::shared_ptr<ByteReader>>;
-    void RegisterByteReader(size_t seqId, const std::string& path, PathReaderMap& knownReaders);
+    using ReaderSequenceMap = std::map<std::string, std::map<std::string, size_t>>;
+    void RegisterByteReader(size_t seqId, const std::string& path, PathReaderMap& knownReaders, ReaderSequenceMap& readerSequences);
     cv::Mat ReadImage(size_t seqId, const std::string& path, bool grayscale);
 
     // REVIEW alexeyk: can potentially use vector instead of map. Need to handle default reader and resizing though.
@@ -80,6 +81,7 @@ private:
     SeqReaderMap m_readers;
 
     FileByteReader m_defaultReader;
+    int m_verbosity;
 };
 }}}

View file

@@ -44,16 +44,46 @@ ZipByteReader::ZipPtr ZipByteReader::OpenZip()
     });
 }
 
-void ZipByteReader::Register(size_t seqId, const std::string& path)
+void ZipByteReader::Register(const std::map<std::string, size_t>& sequences)
 {
     auto zipFile = m_zips.pop_or_create([this]() { return OpenZip(); });
     zip_stat_t stat;
     zip_stat_init(&stat);
-    int err = zip_stat(zipFile.get(), path.c_str(), 0, &stat);
-    if (ZIP_ER_OK != err)
-        RuntimeError("Failed to get file info of %s, zip library error: %s", path.c_str(), GetZipError(err).c_str());
-    m_seqIdToIndex[seqId] = std::make_pair(stat.index, stat.size);
+
+    size_t numberOfEntries = 0;
+    size_t numEntries = zip_get_num_entries(zipFile.get(), 0);
+    for (size_t i = 0; i < numEntries; ++i) {
+        int err = zip_stat_index(zipFile.get(), i, 0, &stat);
+        if (ZIP_ER_OK != err)
+            RuntimeError("Failed to get file info for index %d, zip library error: %s", (int)i, GetZipError(err).c_str());
+        auto sequenceId = sequences.find(std::string(stat.name));
+        if (sequenceId == sequences.end())
+        {
+            continue;
+        }
+        else
+        {
+            m_seqIdToIndex[sequenceId->second] = std::make_pair(stat.index, stat.size);
+            numberOfEntries++;
+        }
+    }
     m_zips.push(std::move(zipFile));
+
+    if (numberOfEntries != sequences.size())
+    {
+        // Not all sequences have been found. Let's print them out and throw.
+        for (const auto& s : sequences)
+        {
+            auto index = m_seqIdToIndex.find(s.second);
+            if (index == m_seqIdToIndex.end())
+            {
+                fprintf(stderr, "Sequence %s is not found in container %s.\n", s.first.c_str(), m_zipPath.c_str());
+            }
+        }
+
+        RuntimeError("Cannot retrieve image data for some sequences. For more detail, please see the log file.");
+    }
 }
 
 cv::Mat ZipByteReader::Read(size_t seqId, const std::string& path, bool grayscale)
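The rewritten Register() flips the lookup direction: instead of one zip_stat() per requested path, it walks the archive's entry list once and matches names against the map, so cost scales with the archive size rather than with per-request directory lookups. A hedged sketch of the same pattern against the plain libzip C API (error handling trimmed; not the CNTK code):

    #include <string>
    #include <unordered_map>
    #include <zip.h>

    // Build a name -> entry-index map by enumerating the archive once.
    std::unordered_map<std::string, zip_uint64_t> BuildZipIndex(zip_t* archive)
    {
        std::unordered_map<std::string, zip_uint64_t> index;
        zip_int64_t numEntries = zip_get_num_entries(archive, 0);
        for (zip_uint64_t i = 0; i < (zip_uint64_t)numEntries; ++i)
        {
            zip_stat_t st;
            zip_stat_init(&st);
            if (zip_stat_index(archive, i, 0, &st) == 0 && (st.valid & ZIP_STAT_NAME))
                index[st.name] = i; // later per-image lookups become O(1)
        }
        return index;
    }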

View file

@@ -22,17 +22,10 @@
 == Preliminaries ==
 
 To build the cpu version, you have to install intel MKL blas library
-or ACML library first. Note that ACML is free, whereas MKL may not be.
 
 for MKL:
 1. Download from https://software.intel.com/en-us/intel-mkl
-for ACML:
-1. Download from
-http://developer.amd.com/tools-and-sdks/archive/amd-core-math-library-acml/acml-downloads-resources/
-We have seen some problems with some versions of the library on Intel
-processors, but have had success with acml-5-3-1-ifort-64bit.tgz
 
 for Kaldi:
 1. In kaldi-trunk/tools/Makefile, uncomment # OPENFST_VERSION = 1.4.1, and
 re-install OpenFst using the makefile.
@@ -54,8 +47,7 @@ build in the directory "build" type
 (For an in source build, just run configure in the $CNTK directory).
 You will see various options for configure, as well as their default
-values. CNTK needs a CPU math directory, either acml or mkl. If you
-do not specify one and both are available, acml will be used. For GPU
+values. CNTK needs a CPU math library (mkl). For GPU
 use, a cuda and gdk directory are also required. Similarly, to build
 the kaldi plugin a kaldi directory is required. You may also specify
 whether you want a debug or release build, as well as add additional

View file

@@ -171,6 +171,12 @@ public:
     // setup all the state variables and state tables for state machine
     void Init();
 
+    // convenience function for setting the flags
+    inline unsigned int SetSequenceFlags()
+    {
+        return (m_beginSequence ? seqFlagStartLabel : 0) | (m_endSequence ? seqFlagStopLabel : 0) | seqFlagLineBreak;
+    }
+
     // Parser destructor
     ~SequenceParser();
@@ -334,8 +340,7 @@ public:
         case EndOfLine:
             if (seqPos)
             {
-                SequencePosition sequencePos(numbers->size(), labels->size(),
-                                             (m_beginSequence ? seqFlagStartLabel : 0) | (m_endSequence ? seqFlagStopLabel : 0) | seqFlagLineBreak);
+                SequencePosition sequencePos(numbers->size(), labels->size(), SetSequenceFlags());
                 // add a sequence element to the list
                 seqPos->push_back(sequencePos);
                 sequencePositionLast = sequencePos;
@@ -429,8 +434,7 @@ public:
         // this could probably be fixed by taking another pass through the loop above, but this is easier
         if (seqPos)
         {
-            SequencePosition sequencePos(numbers->size(), labels->size(),
-                                         m_beginSequence ? seqFlagStartLabel : 0 | m_endSequence ? seqFlagStopLabel : 0 | seqFlagLineBreak);
+            SequencePosition sequencePos(numbers->size(), labels->size(), SetSequenceFlags());
             // add the final sequence element if needed
             if (!(sequencePos.labelPos == sequencePositionLast.labelPos && sequencePos.numberPos == sequencePositionLast.numberPos))
             {
@@ -510,6 +514,7 @@ public:
     using SequenceParser<NumType, LabelType>::m_totalNumbersConverted;
     using SequenceParser<NumType, LabelType>::m_dimLabelsOut;
     using SequenceParser<NumType, LabelType>::m_bufferStart;
+    using SequenceParser<NumType, LabelType>::SetSequenceFlags;
     LMSequenceParser()
     {
         mFile = nullptr;
@@ -594,8 +599,7 @@ public:
                 labels->push_back(std::move(vstr[i])); // TODO: is this an entire sequence, or multiple columns describing a single token?
 
             // add a sequence element to the list
-            SequencePosition sequencePos(numbers->size(), labels->size(),
-                                         m_beginSequence ? seqFlagStartLabel : 0 | m_endSequence ? seqFlagStopLabel : 0 | seqFlagLineBreak);
+            SequencePosition sequencePos(numbers->size(), labels->size(), SetSequenceFlags());
             seqPos->push_back(sequencePos);
 
             lineCount++;
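The SetSequenceFlags() refactor above also fixes a latent operator-precedence bug: in the two call sites that lacked parentheses, '|' binds tighter than '?:', so the conditionals nested instead of OR-ing the flags. A standalone demonstration (illustrative values only, not CNTK code):

    #include <cassert>

    int main()
    {
        const unsigned start = 1, stop = 2, lineBreak = 4;
        const bool begin = true, end = false;

        // Old spelling parses as: begin ? start : ((0 | end) ? stop : (0 | lineBreak))
        unsigned buggy = begin ? start : 0 | end ? stop : 0 | lineBreak;

        // Intended meaning, as the new helper writes it with parentheses:
        unsigned fixed = (begin ? start : 0) | (end ? stop : 0) | lineBreak;

        assert(buggy == 1); // lineBreak flag silently lost when begin is true
        assert(fixed == 5); // start | lineBreak
        return 0;
    }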

View file

@@ -80,8 +80,8 @@ void BlockRandomizer::StartEpoch(const EpochConfiguration& config)
 #ifdef _DEBUG
     size_t epochStartFrame = config.m_epochIndex * m_epochSize;
-    fprintf(stderr, "BlockRandomizer::StartEpoch: epoch %" PRIu64 ": frames [%" PRIu64 "..%" PRIu64 "] (first sequence at sample %" PRIu64 "), data subset %" PRIu64 " of %" PRIu64 "\n",
-            config.m_epochIndex,
+    fprintf(stderr, "BlockRandomizer::StartEpoch: epoch %" PRIu64 ": samples [%" PRIu64 "..%" PRIu64 "] (first sequence at sample %" PRIu64 "), worker rank %" PRIu64 ", total workers %" PRIu64 "\n",
+            config.m_epochIndex + 1,
             epochStartFrame,
             epochStartFrame + m_epochSize,
             m_globalSamplePosition,
@@ -107,7 +107,7 @@ void BlockRandomizer::PrepareNewSweepIfNeeded(size_t samplePosition)
         m_chunkRandomizer->Randomize((unsigned int)m_sweep);
 
         // Resetting sequence randomizer.
-        m_sequenceRandomizer->Reset(m_sweep + 1);
+        m_sequenceRandomizer->Reset(m_sweep);
         m_lastSeenChunkId = CHUNKID_MAX;
     }
 }
@@ -138,8 +138,8 @@ Sequences BlockRandomizer::GetNextSequences(size_t sampleCount)
     if (m_verbosity >= Debug)
         fprintf(stderr, "BlockRandomizer::GetNextSequences(): getting %" PRIu64 " out of %" PRIu64 " sequences for %" PRIu64 " requested samples in sweep %" PRIu64 "\n",
-                sequences.size(),
                 decimated.size(),
+                sequences.size(),
                 sampleCount,
                 m_sweep);

View file

@@ -10,25 +10,6 @@
 
 namespace Microsoft { namespace MSR { namespace CNTK {
 
-    // NOTE: This is an old code, used for legacy randomization to make sure we preserve the same behavior for the tests.
-    // TODO: Deprecate when the new randomizer is in place.
-    template <typename TVector>
-    void RandomShuffle(TVector& v, size_t randomSeed)
-    {
-        if (v.size() > RAND_MAX * static_cast<size_t>(RAND_MAX))
-        {
-            RuntimeError("RandomShuffle: too large set: need to change to different random generator!");
-        }
-
-        srand(static_cast<unsigned int>(randomSeed));
-        foreach_index(currentLocation, v)
-        {
-            // Pick a random location a location and swap with current
-            const size_t randomLocation = rand(0, v.size());
-            std::swap(v[currentLocation], v[randomLocation]);
-        }
-    }
-
     ChunkRandomizer::ChunkRandomizer(IDataDeserializerPtr deserializer, size_t randomizationRangeInSamples, bool legacy) :
         m_deserializer(deserializer), m_legacy(legacy), m_randomizationRangeInSamples(randomizationRangeInSamples)
     {
@@ -52,15 +33,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             randomizedChunkIndices.push_back(i);
         }
 
-        if (m_legacy)
-        {
-            RandomShuffle(randomizedChunkIndices, seed);
-        }
-        else
-        {
-            std::mt19937 m_rng(static_cast<int>(seed));
-            std::shuffle(randomizedChunkIndices.begin(), randomizedChunkIndices.end(), m_rng);
-        }
+        m_rng.seed(seed);
+        RandomShuffleMT(randomizedChunkIndices, m_rng);
 
         // Place randomized chunks on the timeline
         m_randomizedChunks.clear();
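RandomShuffleMT presumably performs a Fisher-Yates shuffle driven by the externally seeded engine; the call shape above is the only contract this commit shows. A minimal sketch under that assumption (not the CNTK source):

    #include <cstddef>
    #include <random>
    #include <utility>
    #include <vector>

    template <typename TVector, typename TEngine>
    void RandomShuffleMTSketch(TVector& v, TEngine& rng)
    {
        // Backward Fisher-Yates: swap each position with a uniform pick from
        // the not-yet-fixed prefix.
        for (size_t i = v.size(); i > 1; --i)
        {
            // Uniform index in [0, i): unbiased, unlike rand() % i.
            std::uniform_int_distribution<size_t> dist(0, i - 1);
            std::swap(v[i - 1], v[dist(rng)]);
        }
    }

    int main()
    {
        std::vector<int> chunkIndices = {0, 1, 2, 3, 4};
        std::mt19937_64 rng(42); // seeded once per sweep, as in Randomize() above
        RandomShuffleMTSketch(chunkIndices, rng);
        return 0;
    }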

View file

@@ -7,6 +7,7 @@
 
 #include <vector>
 #include "DataDeserializer.h"
+#include <random>
 
 namespace Microsoft { namespace MSR { namespace CNTK {
 
@@ -68,6 +69,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         bool m_legacy;
         // Randomization range in samples.
         size_t m_randomizationRangeInSamples;
+
+        std::mt19937_64 m_rng;
     };
 
     typedef std::shared_ptr<ChunkRandomizer> ChunkRandomizerPtr;

View file

@@ -45,7 +45,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     // Resets the current sweep according to the randomization seed provided.
     void SequenceRandomizer::Reset(size_t randSeed)
     {
-        srand((unsigned int)randSeed);
+        m_rng.seed((unsigned long)randSeed);
 
         m_sequenceWindow.clear();
         m_chunkWindow.clear();
@@ -197,7 +197,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             for (;;)
             {
                 // Pick a sequence position from [posBegin, posEnd)
-                const size_t j = rand(posBegin, posEnd);
+                const size_t j = RandMT(posBegin, posEnd, m_rng);
 
                 // Pick up j sequence.
                 ChunkIdType jChunkIndex = GetChunkIndexForSequencePosition(j);
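RandMT(posBegin, posEnd, m_rng) is assumed to return a uniformly distributed index in the half-open window [posBegin, posEnd); a sketch under that assumption (not the CNTK implementation):

    #include <cstddef>
    #include <random>

    inline size_t RandMTSketch(size_t begin, size_t end, std::mt19937_64& rng)
    {
        // uniform_int_distribution takes inclusive bounds, hence end - 1.
        std::uniform_int_distribution<size_t> dist(begin, end - 1);
        return dist(rng);
    }

    int main()
    {
        std::mt19937_64 rng(0);
        size_t j = RandMTSketch(10, 20, rng); // 10 <= j < 20
        (void)j;
        return 0;
    }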

View file

@@ -11,6 +11,7 @@
 #include "DataDeserializer.h"
 #include "ChunkRandomizer.h"
 #include <deque>
+#include <random>
 
 namespace Microsoft { namespace MSR { namespace CNTK {
 
@@ -164,6 +165,8 @@ private:
 
     // General configuration
     int m_verbosity;
+
+    std::mt19937_64 m_rng;
 };
 
 typedef std::shared_ptr<SequenceRandomizer> SequenceRandomizerPtr;

View file

@@ -40,30 +40,10 @@ template SGD<double>::SGD(const ScriptableObjects::IConfigRecord&);
 // -----------------------------------------------------------------------
 
 template <class ElemType>
-void SGD<ElemType>::Train(function<ComputationNetworkPtr(DEVICEID_TYPE)> createNetworkFn, DEVICEID_TYPE deviceId,
+void SGD<ElemType>::Train(shared_ptr<ComputationNetwork> net, DEVICEID_TYPE deviceId,
                           IDataReader* trainSetDataReader,
-                          IDataReader* validationSetDataReader,
-                          const bool makeMode)
+                          IDataReader* validationSetDataReader, int startEpoch, bool loadNetworkFromCheckpoint)
 {
-    // determine which epoch to start with, including recovering a checkpoint if any and 'makeMode' enabled
-    int startEpoch = DetermineStartEpoch(makeMode);
-    if (startEpoch == m_maxEpochs)
-    {
-        LOGPRINTF(stderr, "No further training is necessary.\n");
-        return;
-    }
-
-    wstring modelFileName = GetModelNameForEpoch(int(startEpoch) - 1);
-    bool loadNetworkFromCheckpoint = startEpoch >= 0;
-    fprintf(stderr, "\n");
-    if (loadNetworkFromCheckpoint)
-        LOGPRINTF(stderr, "Starting from checkpoint. Loading network from '%ls'.\n", modelFileName.c_str());
-    else
-        LOGPRINTF(stderr, "Creating virgin network.\n");
-
-    // create or load from checkpoint
-    shared_ptr<ComputationNetwork> net = !loadNetworkFromCheckpoint ? createNetworkFn(deviceId) : ComputationNetwork::CreateFromFile<ElemType>(deviceId, modelFileName);
-
     // log the device we are computing on
     LOGPRINTF(stderr, "%s model with %d nodes", loadNetworkFromCheckpoint ? "Loaded" : "Created", (int)net->GetTotalNumberOfNodes());
     if (net->GetDeviceId() < 0)
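With the checkpoint-recovery logic hoisted out of Train(), the caller now owns it; DetermineStartEpoch(), GetModelNameForEpoch() and GetMaxEpochs() are made public in SGD.h below to support exactly this. A sketch of the assumed call sequence (the driver side is not part of this excerpt, so this is an illustration, not the actual caller):

    // Assumes the CNTK headers that declare SGD, ComputationNetwork, IDataReader.
    template <class ElemType>
    static void TrainOrRestoreSketch(SGD<ElemType>& sgd,
                                     function<ComputationNetworkPtr(DEVICEID_TYPE)> createNetworkFn,
                                     DEVICEID_TYPE deviceId,
                                     IDataReader* trainReader,
                                     IDataReader* validationReader,
                                     bool makeMode)
    {
        int startEpoch = sgd.DetermineStartEpoch(makeMode); // -1 if no checkpoint exists
        if (startEpoch == (int)sgd.GetMaxEpochs())
            return; // nothing left to train

        bool loadNetworkFromCheckpoint = startEpoch >= 0;
        wstring modelFileName = sgd.GetModelNameForEpoch(startEpoch - 1);
        shared_ptr<ComputationNetwork> net = !loadNetworkFromCheckpoint
            ? createNetworkFn(deviceId)
            : ComputationNetwork::CreateFromFile<ElemType>(deviceId, modelFileName);

        sgd.Train(net, deviceId, trainReader, validationReader, startEpoch, loadNetworkFromCheckpoint);
    }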

View file

@@ -110,6 +110,8 @@ struct SGDParams : public ScriptableObjects::Object
     // SGDParams(SGDParams&&) = default; // (does not compile in VS 2013; not critical)
 
+    size_t GetMaxEpochs() { return m_maxEpochs; }
+
 protected:
     // learning rate per sample provided outside
     floatargvector m_learningRatesParam;
@@ -342,10 +344,9 @@ public:
         m_parallelizationMethod = ParallelizationMethod::none;
     }
 
-    void Train(function<ComputationNetworkPtr(DEVICEID_TYPE)> createNetworkFn, DEVICEID_TYPE deviceId,
+    void Train(shared_ptr<ComputationNetwork> net, DEVICEID_TYPE deviceId,
                IDataReader* trainSetDataReader,
-               IDataReader* validationSetDataReader,
-               const bool makeMode = true);
+               IDataReader* validationSetDataReader, int startEpoch, bool loadNetworkFromCheckpoint);
     void Adapt(wstring origModelFileName, wstring refNodeName,
                IDataReader* trainSetDataReader,
                IDataReader* validationSetDataReader,
@@ -483,6 +484,10 @@ public:
                         const double L1RegWeight,
                         const bool needAveMultiplier,
                         const bool useNesterovMomentum);
+
+    // return -1 if nothing exists
+    int DetermineStartEpoch(const bool makeMode);
+    wstring GetModelNameForEpoch(const int epoch, bool bLastModel = false);
 
 protected:
     // UpdateWeights - update the weights in
@@ -517,10 +522,6 @@ protected:
                               /*out*/ size_t& minibatchSize);
 
     wstring GetCheckPointFileNameForEpoch(const int epoch);
-    wstring GetModelNameForEpoch(const int epoch, bool bLastModel = false);
-
-    // return -1 if nothing exists
-    int DetermineStartEpoch(const bool makeMode);
 
     GradientsUpdateType GradUpdateType() const
     {

View file

@@ -180,7 +180,7 @@ public:
             m_gradHeader.reset(DistGradHeader::Create(evalNodes.size()), [](DistGradHeader* ptr) {
                 DistGradHeader::Destroy(ptr);
             });
-            m_distGradAgg = make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, false, m_traceLevel);
+            m_distGradAgg = make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, false /*useAsyncAggregation*/, 0 /*syncStatsTrace*/);
         }
 
         m_gradHeader->numEvalNode = evalNodes.size();

View file

@@ -21,6 +21,7 @@ mkdir $DataDir
 cp -R $DataSourceDir/MNIST/v0/Train-28x28_cntk_text.txt $DataDir || exit $?
 cp -R $DataSourceDir/CIFAR/v0/cifar-10-batches-py $DataDir || exit $?
 cp -R $TEST_DIR/../../../../Examples/Other/Simple2d/Data/SimpleDataTrain_cntk_text.txt $DataDir || exit $?
+cp -R $TEST_DIR/../../Text/SequenceClassification/Data/Train.ctf $DataDir || exit $?
 
 pushd $DataDir

View file

@@ -272,8 +272,8 @@ Post-processing network...
 4 roots:
     ce = CrossEntropyWithSoftmax()
-    err = ClassificationError()
-    errTop1 = ClassificationError()
+    err = ErrorPrediction()
+    errTop1 = ErrorPrediction()
     ol.z = Plus()
 
 Validating network. 17 nodes to process in pass 1.
@@ -292,9 +292,9 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 200], [200 x 1 x *] -> [10 x 1
 Validating --> ol.b = LearnableParameter() : -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x 1 x *], [10 x 1] -> [10 x 1 x *]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
 Validating --> unnamed81 = LearnableParameter() : -> [1 x 1]
-Validating --> errTop1 = ClassificationError (labels, ol.z, unnamed81) : [10 x *], [10 x 1 x *], [1 x 1] -> [1]
+Validating --> errTop1 = ErrorPrediction (labels, ol.z, unnamed81) : [10 x *], [10 x 1 x *], [1 x 1] -> [1]
 
 Validating network. 9 nodes to process in pass 2.
@@ -314,8 +314,8 @@ Post-processing network complete.
 
 05/13/2016 15:10:02: Evaluation criterion node(s):
-05/13/2016 15:10:02:     errTop1 = ClassificationError
-05/13/2016 15:10:02:     err = ClassificationError
+05/13/2016 15:10:02:     errTop1 = ErrorPrediction
+05/13/2016 15:10:02:     err = ErrorPrediction
 
 Allocating matrices for forward and/or backward propagation.
@@ -390,8 +390,8 @@ Post-processing network...
 4 roots:
     ce = CrossEntropyWithSoftmax()
-    err = ClassificationError()
-    errTop1 = ClassificationError()
+    err = ErrorPrediction()
+    errTop1 = ErrorPrediction()
     ol.z = Plus()
 
 Validating network. 17 nodes to process in pass 1.
@@ -410,9 +410,9 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 200], [200 x 1 x *1] -> [10 x 1
 Validating --> ol.b = LearnableParameter() : -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x 1 x *1], [10 x 1] -> [10 x 1 x *1]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
 Validating --> unnamed81 = LearnableParameter() : -> [1 x 1]
-Validating --> errTop1 = ClassificationError (labels, ol.z, unnamed81) : [10 x *1], [10 x 1 x *1], [1 x 1] -> [1]
+Validating --> errTop1 = ErrorPrediction (labels, ol.z, unnamed81) : [10 x *1], [10 x 1 x *1], [1 x 1] -> [1]
 
 Validating network. 9 nodes to process in pass 2.

View file

@@ -270,8 +270,8 @@ Post-processing network...
 4 roots:
     ce = CrossEntropyWithSoftmax()
-    err = ClassificationError()
-    errTop1 = ClassificationError()
+    err = ErrorPrediction()
+    errTop1 = ErrorPrediction()
     ol.z = Plus()
 
 Validating network. 17 nodes to process in pass 1.
@@ -290,9 +290,9 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 200], [200 x 1 x *] -> [10 x 1
 Validating --> ol.b = LearnableParameter() : -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x 1 x *], [10 x 1] -> [10 x 1 x *]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
 Validating --> unnamed81 = LearnableParameter() : -> [1 x 1]
-Validating --> errTop1 = ClassificationError (labels, ol.z, unnamed81) : [10 x *], [10 x 1 x *], [1 x 1] -> [1]
+Validating --> errTop1 = ErrorPrediction (labels, ol.z, unnamed81) : [10 x *], [10 x 1 x *], [1 x 1] -> [1]
 
 Validating network. 9 nodes to process in pass 2.
@@ -312,8 +312,8 @@ Post-processing network complete.
 
 05/13/2016 08:15:53: Evaluation criterion node(s):
-05/13/2016 08:15:53:     errTop1 = ClassificationError
-05/13/2016 08:15:53:     err = ClassificationError
+05/13/2016 08:15:53:     errTop1 = ErrorPrediction
+05/13/2016 08:15:53:     err = ErrorPrediction
 
 Allocating matrices for forward and/or backward propagation.
@@ -388,8 +388,8 @@ Post-processing network...
 4 roots:
     ce = CrossEntropyWithSoftmax()
-    err = ClassificationError()
-    errTop1 = ClassificationError()
+    err = ErrorPrediction()
+    errTop1 = ErrorPrediction()
     ol.z = Plus()
 
 Validating network. 17 nodes to process in pass 1.
@@ -408,9 +408,9 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 200], [200 x 1 x *1] -> [10 x 1
 Validating --> ol.b = LearnableParameter() : -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x 1 x *1], [10 x 1] -> [10 x 1 x *1]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
 Validating --> unnamed81 = LearnableParameter() : -> [1 x 1]
-Validating --> errTop1 = ClassificationError (labels, ol.z, unnamed81) : [10 x *1], [10 x 1 x *1], [1 x 1] -> [1]
+Validating --> errTop1 = ErrorPrediction (labels, ol.z, unnamed81) : [10 x *1], [10 x 1 x *1], [1 x 1] -> [1]
 
 Validating network. 9 nodes to process in pass 2.

View file

@@ -284,7 +284,7 @@ Post-processing network...
 3 roots:
     ce = CrossEntropyWithSoftmax()
-    err = ClassificationError()
+    err = ErrorPrediction()
     ol.z = Plus()
 
 Validating network. 27 nodes to process in pass 1.
@@ -315,7 +315,7 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 128], [128 x 1 x *] -> [10 x 1
 Validating --> ol.b = LearnableParameter() : -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x 1 x *], [10 x 1] -> [10 x 1 x *]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
 
 Validating network. 16 nodes to process in pass 2.
@@ -343,7 +343,7 @@ Post-processing network complete.
 
 05/13/2016 15:10:11: Evaluation criterion node(s):
-05/13/2016 15:10:11:     err = ClassificationError
+05/13/2016 15:10:11:     err = ErrorPrediction
 
 Allocating matrices for forward and/or backward propagation.
@@ -429,7 +429,7 @@ Post-processing network...
 3 roots:
     ce = CrossEntropyWithSoftmax()
-    err = ClassificationError()
+    err = ErrorPrediction()
     ol.z = Plus()
 
 Validating network. 27 nodes to process in pass 1.
@@ -460,7 +460,7 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 128], [128 x 1 x *1] -> [10 x 1
 Validating --> ol.b = LearnableParameter() : -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x 1 x *1], [10 x 1] -> [10 x 1 x *1]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
 
 Validating network. 16 nodes to process in pass 2.

View file

@@ -282,7 +282,7 @@ Post-processing network...
 3 roots:
     ce = CrossEntropyWithSoftmax()
-    err = ClassificationError()
+    err = ErrorPrediction()
     ol.z = Plus()
 
 Validating network. 27 nodes to process in pass 1.
@@ -313,7 +313,7 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 128], [128 x 1 x *] -> [10 x 1
 Validating --> ol.b = LearnableParameter() : -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x 1 x *], [10 x 1] -> [10 x 1 x *]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
 
 Validating network. 16 nodes to process in pass 2.
@@ -341,7 +341,7 @@ Post-processing network complete.
 
 05/13/2016 08:16:18: Evaluation criterion node(s):
-05/13/2016 08:16:18:     err = ClassificationError
+05/13/2016 08:16:18:     err = ErrorPrediction
 
 Allocating matrices for forward and/or backward propagation.
@@ -427,7 +427,7 @@ Post-processing network...
 3 roots:
     ce = CrossEntropyWithSoftmax()
-    err = ClassificationError()
+    err = ErrorPrediction()
     ol.z = Plus()
 
 Validating network. 27 nodes to process in pass 1.
@@ -458,7 +458,7 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 128], [128 x 1 x *1] -> [10 x 1
 Validating --> ol.b = LearnableParameter() : -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x 1 x *1], [10 x 1] -> [10 x 1 x *1]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
 
 Validating network. 16 nodes to process in pass 2.

View file

@@ -287,7 +287,7 @@ Post-processing network...
 3 roots:
     ce = CrossEntropyWithSoftmax()
-    err = ClassificationError()
+    err = ErrorPrediction()
     ol.z = Plus()
 
 Validating network. 36 nodes to process in pass 1.
@@ -329,7 +329,7 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 128], [128 x *] -> [10 x *]
 Validating --> ol.b = LearnableParameter() : -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x *], [10 x 1] -> [10 x 1 x *]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
 
 Validating network. 16 nodes to process in pass 2.
@@ -363,7 +363,7 @@ Post-processing network complete.
 
 05/13/2016 15:10:29: Evaluation criterion node(s):
-05/13/2016 15:10:29:     err = ClassificationError
+05/13/2016 15:10:29:     err = ErrorPrediction
 
 Allocating matrices for forward and/or backward propagation.
@@ -462,7 +462,7 @@ Post-processing network...
 3 roots:
     ce = CrossEntropyWithSoftmax()
-    err = ClassificationError()
+    err = ErrorPrediction()
     ol.z = Plus()
 
 Validating network. 36 nodes to process in pass 1.
@@ -502,7 +502,7 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 128], [128 x *1] -> [10 x *1]
 Validating --> ol.b = LearnableParameter() : -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x *1], [10 x 1] -> [10 x 1 x *1]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
 
 Validating network. 16 nodes to process in pass 2.

View file

@@ -285,7 +285,7 @@ Post-processing network...
 3 roots:
     ce = CrossEntropyWithSoftmax()
-    err = ClassificationError()
+    err = ErrorPrediction()
     ol.z = Plus()
 
 Validating network. 36 nodes to process in pass 1.
@@ -327,7 +327,7 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 128], [128 x *] -> [10 x *]
 Validating --> ol.b = LearnableParameter() : -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x *], [10 x 1] -> [10 x 1 x *]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1]
 
 Validating network. 16 nodes to process in pass 2.
@@ -361,7 +361,7 @@ Post-processing network complete.
 
 05/13/2016 08:16:58: Evaluation criterion node(s):
-05/13/2016 08:16:58:     err = ClassificationError
+05/13/2016 08:16:58:     err = ErrorPrediction
 
 Allocating matrices for forward and/or backward propagation.
@@ -460,7 +460,7 @@ Post-processing network...
 3 roots:
     ce = CrossEntropyWithSoftmax()
-    err = ClassificationError()
+    err = ErrorPrediction()
     ol.z = Plus()
 
 Validating network. 36 nodes to process in pass 1.
@@ -500,7 +500,7 @@ Validating --> ol.t = Times (ol.W, h1.y) : [10 x 128], [128 x *1] -> [10 x *1]
 Validating --> ol.b = LearnableParameter() : -> [10 x 1]
 Validating --> ol.z = Plus (ol.t, ol.b) : [10 x *1], [10 x 1] -> [10 x 1 x *1]
 Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
-Validating --> err = ClassificationError (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
+Validating --> err = ErrorPrediction (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1]
 
 Validating network. 16 nodes to process in pass 2.

View file

@@ -1,49 +1,62 @@
-=== Running /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/gpu/release/bin/cntk configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/../../../../Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/Config/01_Conv.cntk currentDirectory=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData RunDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu DataDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../../Examples/Image/Miscellaneous/CIFAR-10 OutputDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu DeviceId=0 timestamping=true Train=[SGD=[maxEpochs=10]] Train=[SGD=[epochSize=100]] stderr=-
+CPU info:
+    CPU Model Name: Intel(R) Xeon(R) CPU E5-2630 v2 @ 2.60GHz
+    Hardware threads: 24
+    Total Memory: 264172964 kB
+-------------------------------------------------------------------
+=== Running /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/gpu/release/bin/cntk configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/01_Conv.cntk currentDirectory=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData RunDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu DataDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../Examples/Image/Miscellaneous/CIFAR-10 OutputDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu DeviceId=0 timestamping=true Train=[SGD=[maxEpochs=10]] Train=[SGD=[epochSize=100]] stderr=-
 -------------------------------------------------------------------
 Build info: 
 
-    Built time: May 13 2016 14:50:25
-    Last modified date: Thu May 12 14:00:37 2016
+    Built time: Aug 16 2016 09:41:56
+    Last modified date: Fri Aug 12 07:32:43 2016
     Build type: release
     Build target: GPU
     With 1bit-SGD: no
-    Math lib: acml
+    Math lib: mkl
     CUDA_PATH: /usr/local/cuda-7.5
     CUB_PATH: /usr/local/cub-1.4.1
     CUDNN_PATH: /usr/local/cudnn-4.0
     Build Branch: HEAD
-    Build SHA1: 35fadc316f045d843bbd9b85061250a959268787
-    Built by philly on d8dc82703b0f
+    Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
+    Built by philly on f67b30a647de
     Build Path: /home/philly/jenkins/workspace/CNTK-Build-Linux
 -------------------------------------------------------------------
-Changed current directory to /tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
+Changed current directory to /tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
-05/13/2016 15:10:47: Redirecting stderr to file -_Train_Test.log
-05/13/2016 15:10:47: -------------------------------------------------------------------
-05/13/2016 15:10:47: Build info: 
-05/13/2016 15:10:47: Built time: May 13 2016 14:50:25
-05/13/2016 15:10:47: Last modified date: Thu May 12 14:00:37 2016
-05/13/2016 15:10:47: Build type: release
-05/13/2016 15:10:47: Build target: GPU
-05/13/2016 15:10:47: With 1bit-SGD: no
-05/13/2016 15:10:47: Math lib: acml
-05/13/2016 15:10:47: CUDA_PATH: /usr/local/cuda-7.5
-05/13/2016 15:10:47: CUB_PATH: /usr/local/cub-1.4.1
-05/13/2016 15:10:47: CUDNN_PATH: /usr/local/cudnn-4.0
-05/13/2016 15:10:47: Build Branch: HEAD
-05/13/2016 15:10:47: Build SHA1: 35fadc316f045d843bbd9b85061250a959268787
-05/13/2016 15:10:47: Built by philly on d8dc82703b0f
-05/13/2016 15:10:47: Build Path: /home/philly/jenkins/workspace/CNTK-Build-Linux
-05/13/2016 15:10:47: -------------------------------------------------------------------
-
-05/13/2016 15:10:47: Running on localhost at 2016/05/13 15:10:47
-05/13/2016 15:10:47: Command line: 
-/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/gpu/release/bin/cntk configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/../../../../Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/Config/01_Conv.cntk currentDirectory=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData RunDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu DataDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../../Examples/Image/Miscellaneous/CIFAR-10 OutputDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu DeviceId=0 timestamping=true Train=[SGD=[maxEpochs=10]] Train=[SGD=[epochSize=100]] stderr=-
+08/16/2016 10:50:36: Redirecting stderr to file -_Train_Test.log
+08/16/2016 10:50:36: -------------------------------------------------------------------
+08/16/2016 10:50:36: Build info: 
+08/16/2016 10:50:36: Built time: Aug 16 2016 09:41:56
+08/16/2016 10:50:36: Last modified date: Fri Aug 12 07:32:43 2016
+08/16/2016 10:50:36: Build type: release
+08/16/2016 10:50:36: Build target: GPU
+08/16/2016 10:50:36: With 1bit-SGD: no
+08/16/2016 10:50:36: Math lib: mkl
+08/16/2016 10:50:36: CUDA_PATH: /usr/local/cuda-7.5
+08/16/2016 10:50:36: CUB_PATH: /usr/local/cub-1.4.1
+08/16/2016 10:50:36: CUDNN_PATH: /usr/local/cudnn-4.0
+08/16/2016 10:50:36: Build Branch: HEAD
+08/16/2016 10:50:36: Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
+08/16/2016 10:50:36: Built by philly on f67b30a647de
+08/16/2016 10:50:36: Build Path: /home/philly/jenkins/workspace/CNTK-Build-Linux
+08/16/2016 10:50:36: -------------------------------------------------------------------
+08/16/2016 10:50:37: -------------------------------------------------------------------
+08/16/2016 10:50:37: GPU info:
+08/16/2016 10:50:37: Device[0]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:50:37: Device[1]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:50:37: Device[2]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:50:37: Device[3]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
+08/16/2016 10:50:37: -------------------------------------------------------------------
+
+08/16/2016 10:50:37: Running on localhost at 2016/08/16 10:50:37
+08/16/2016 10:50:37: Command line: 
+/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/gpu/release/bin/cntk configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/01_Conv.cntk currentDirectory=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData RunDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu DataDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../Examples/Image/Miscellaneous/CIFAR-10 OutputDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu DeviceId=0 timestamping=true Train=[SGD=[maxEpochs=10]] Train=[SGD=[epochSize=100]] stderr=-
 
-05/13/2016 15:10:47: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
+08/16/2016 10:50:37: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
-05/13/2016 15:10:47: RootDir = "."
+08/16/2016 10:50:37: RootDir = "."
 ConfigDir = "$RootDir$"
 DataDir = "$RootDir$"
 OutputDir = "$RootDir$/Output"
@@ -53,7 +66,6 @@ precision = "float"
 deviceId = 0
 imageLayout = "cudnn"
 initOnCPUOnly=true
-prefetch = "true"
 command = Train:Test
 modelPath = "$ModelDir$/01_Convolution"
 stderr = "$OutputDir$/01_Conv"
@@ -86,7 +98,7 @@ Train = [
             format = "dense"
         ]
     ]
 ]
 ]
 
 Test = [
     action = "test"
@@ -104,42 +116,41 @@ Test = [
             format = "dense"
         ]
     ]
 ]
 ]
-currentDirectory=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
-RunDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu
-DataDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
-ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../../Examples/Image/Miscellaneous/CIFAR-10
-OutputDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu
+currentDirectory=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
+RunDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu
+DataDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
+ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../Examples/Image/Miscellaneous/CIFAR-10
+OutputDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu
 DeviceId=0
 timestamping=true
 Train=[SGD=[maxEpochs=10]]
 Train=[SGD=[epochSize=100]]
 stderr=-
-05/13/2016 15:10:47: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<
+08/16/2016 10:50:37: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<
 
-05/13/2016 15:10:47: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+08/16/2016 10:50:37: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
-05/13/2016 15:10:47: RootDir = "."
+08/16/2016 10:50:37: RootDir = "."
 ConfigDir = "."
 DataDir = "."
 OutputDir = "./Output"
-ModelDir = "/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models"
-ndlMacros = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl"
+ModelDir = "/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models"
+ndlMacros = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl"
 precision = "float"
 deviceId = 0
 imageLayout = "cudnn"
 initOnCPUOnly=true
-prefetch = "true"
 command = Train:Test
-modelPath = "/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution"
-stderr = "/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/01_Conv"
+modelPath = "/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution"
+stderr = "/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/01_Conv"
 traceLevel = 1
 numMBsToShowResult = 500
 Train = [
     action = "train"
     NDLNetworkBuilder = [
-        networkDescription = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/01_Convolution.ndl"
+        networkDescription = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/01_Convolution.ndl"
     ]
     SGD = [
         epochSize = 49984
@@ -152,7 +163,7 @@ Train = [
     ]
     reader = [
         readerType = "CNTKTextFormatReader"
-        file = "/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData/Train_cntk_text.txt"
+        file = "/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData/Train_cntk_text.txt"
         input = [
             features = [
                 dim = 3072
@@ -163,14 +174,14 @@ Train = [
                 format = "dense"
             ]
         ]
     ]
 ]
 
 Test = [
     action = "test"
     minibatchSize = 16
     reader = [
         readerType = "CNTKTextFormatReader"
-        file = "/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData/Test_cntk_text.txt"
+        file = "/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData/Test_cntk_text.txt"
         input = [
             features = [
                 dim = 3072
@@ -181,45 +192,44 @@ Test = [
                 format = "dense"
             ]
         ]
     ]
 ]
-currentDirectory=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
-RunDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu
-DataDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
-ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../../Examples/Image/Miscellaneous/CIFAR-10
-OutputDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu
+currentDirectory=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
+RunDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu
+DataDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
+ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../Examples/Image/Miscellaneous/CIFAR-10
+OutputDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu
 DeviceId=0
 timestamping=true
 Train=[SGD=[maxEpochs=10]]
 Train=[SGD=[epochSize=100]]
 stderr=-
-05/13/2016 15:10:47: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+08/16/2016 10:50:37: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 
-05/13/2016 15:10:47: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+08/16/2016 10:50:37: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 configparameters: 01_Conv.cntk:command=Train:Test
-configparameters: 01_Conv.cntk:ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../../Examples/Image/Miscellaneous/CIFAR-10
-configparameters: 01_Conv.cntk:currentDirectory=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
+configparameters: 01_Conv.cntk:ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../Examples/Image/Miscellaneous/CIFAR-10
+configparameters: 01_Conv.cntk:currentDirectory=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
configparameters: 01_Conv.cntk:DataDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData configparameters: 01_Conv.cntk:DataDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData
configparameters: 01_Conv.cntk:deviceId=0 configparameters: 01_Conv.cntk:deviceId=0
configparameters: 01_Conv.cntk:imageLayout=cudnn configparameters: 01_Conv.cntk:imageLayout=cudnn
configparameters: 01_Conv.cntk:initOnCPUOnly=true configparameters: 01_Conv.cntk:initOnCPUOnly=true
configparameters: 01_Conv.cntk:ModelDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models configparameters: 01_Conv.cntk:ModelDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models
configparameters: 01_Conv.cntk:modelPath=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution configparameters: 01_Conv.cntk:modelPath=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution
configparameters: 01_Conv.cntk:ndlMacros=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl configparameters: 01_Conv.cntk:ndlMacros=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl
configparameters: 01_Conv.cntk:numMBsToShowResult=500 configparameters: 01_Conv.cntk:numMBsToShowResult=500
configparameters: 01_Conv.cntk:OutputDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu configparameters: 01_Conv.cntk:OutputDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu
configparameters: 01_Conv.cntk:precision=float configparameters: 01_Conv.cntk:precision=float
configparameters: 01_Conv.cntk:prefetch=true
configparameters: 01_Conv.cntk:RootDir=. configparameters: 01_Conv.cntk:RootDir=.
configparameters: 01_Conv.cntk:RunDir=/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu configparameters: 01_Conv.cntk:RunDir=/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu
configparameters: 01_Conv.cntk:stderr=- configparameters: 01_Conv.cntk:stderr=-
configparameters: 01_Conv.cntk:Test=[ configparameters: 01_Conv.cntk:Test=[
action = "test" action = "test"
minibatchSize = 16 minibatchSize = 16
reader = [ reader = [
readerType = "CNTKTextFormatReader" readerType = "CNTKTextFormatReader"
file = "/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData/Test_cntk_text.txt" file = "/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData/Test_cntk_text.txt"
input = [ input = [
features = [ features = [
dim = 3072 dim = 3072
@ -230,7 +240,7 @@ configparameters: 01_Conv.cntk:Test=[
format = "dense" format = "dense"
] ]
] ]
] ]
] ]
configparameters: 01_Conv.cntk:timestamping=true configparameters: 01_Conv.cntk:timestamping=true
@ -238,7 +248,7 @@ configparameters: 01_Conv.cntk:traceLevel=1
configparameters: 01_Conv.cntk:Train=[ configparameters: 01_Conv.cntk:Train=[
action = "train" action = "train"
NDLNetworkBuilder = [ NDLNetworkBuilder = [
networkDescription = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/01_Convolution.ndl" networkDescription = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/01_Convolution/../../../../../../../Examples/Image/Miscellaneous/CIFAR-10/01_Convolution.ndl"
] ]
SGD = [ SGD = [
epochSize = 49984 epochSize = 49984
@ -251,7 +261,7 @@ configparameters: 01_Conv.cntk:Train=[
] ]
reader = [ reader = [
readerType = "CNTKTextFormatReader" readerType = "CNTKTextFormatReader"
file = "/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData/Train_cntk_text.txt" file = "/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/TestData/Train_cntk_text.txt"
input = [ input = [
features = [ features = [
dim = 3072 dim = 3072
@ -262,33 +272,57 @@ configparameters: 01_Conv.cntk:Train=[
format = "dense" format = "dense"
] ]
] ]
] ]
] [SGD=[maxEpochs=10]] [SGD=[epochSize=100]] ] [SGD=[maxEpochs=10]] [SGD=[epochSize=100]]
05/13/2016 15:10:47: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< 08/16/2016 10:50:37: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
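The trailing [SGD=[maxEpochs=10]] [SGD=[epochSize=100]] blocks above are the command-line overrides echoed back as appended to the Train section; when a section repeats, later assignments win. A minimal sketch of that merge semantics (illustrative Python, not CNTK's actual config parser):

    def merge(base: dict, *overrides: dict) -> dict:
        """Recursively apply override sections; later values win (illustrative only)."""
        out = dict(base)
        for ov in overrides:
            for key, val in ov.items():
                if isinstance(val, dict) and isinstance(out.get(key), dict):
                    out[key] = merge(out[key], val)
                else:
                    out[key] = val
        return out

    train = {"SGD": {"epochSize": 49984}}
    print(merge(train, {"SGD": {"maxEpochs": 10}}, {"SGD": {"epochSize": 100}}))
    # {'SGD': {'epochSize': 100, 'maxEpochs': 10}} -- matching the effective
    # epochSize=100 and "CNTKCommandTrainInfo: Train : 10" below.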
05/13/2016 15:10:47: Commands: Train Test 08/16/2016 10:50:37: Commands: Train Test
05/13/2016 15:10:47: Precision = "float" 08/16/2016 10:50:37: Precision = "float"
05/13/2016 15:10:47: CNTKModelPath: /tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution 08/16/2016 10:50:37: CNTKModelPath: /tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution
05/13/2016 15:10:47: CNTKCommandTrainInfo: Train : 10 08/16/2016 10:50:37: CNTKCommandTrainInfo: Train : 10
05/13/2016 15:10:47: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 10 08/16/2016 10:50:37: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 10
05/13/2016 15:10:47: ############################################################################## 08/16/2016 10:50:37: ##############################################################################
05/13/2016 15:10:47: # # 08/16/2016 10:50:37: # #
05/13/2016 15:10:47: # Action "train" # 08/16/2016 10:50:37: # Action "train" #
05/13/2016 15:10:47: # # 08/16/2016 10:50:37: # #
05/13/2016 15:10:47: ############################################################################## 08/16/2016 10:50:37: ##############################################################################
05/13/2016 15:10:47: CNTKCommandTrainBegin: Train 08/16/2016 10:50:37: CNTKCommandTrainBegin: Train
NDLBuilder Using GPU 0 NDLBuilder Using GPU 0
05/13/2016 15:10:47: Creating virgin network. 08/16/2016 10:50:37: Creating virgin network.
Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 0.000000.
Node 'conv1_act.W' (LearnableParameter operation): Initializing Parameter[32 x 75] <- 0.000000.
Node 'conv1_act.b' (LearnableParameter operation): Initializing Parameter[1 x 1 x 32] <- 0.000000.
Node 'conv2_act.W' (LearnableParameter operation): Initializing Parameter[32 x 800] <- 0.000000.
Node 'conv2_act.b' (LearnableParameter operation): Initializing Parameter[1 x 1 x 32] <- 0.000000.
Node 'conv3_act.W' (LearnableParameter operation): Initializing Parameter[64 x 800] <- 0.000000.
Node 'conv3_act.b' (LearnableParameter operation): Initializing Parameter[1 x 1 x 64] <- 0.000000.
Node 'h1.W' (LearnableParameter operation): Initializing Parameter[64 x 3 x 3 x 64] <- 0.000000.
Node 'h1.b' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 0.000000.
Node 'OutputNodes.W' (LearnableParameter operation): Initializing Parameter[10 x 64] <- 0.000000.
Node 'OutputNodes.b' (LearnableParameter operation): Initializing Parameter[10] <- 0.000000.
Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 128.000000.
Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 128.000000.
Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 128.000000.
Node 'conv1_act.W' (LearnableParameter operation): Initializing Parameter[32 x 75] <- gaussian(seed=1, range=0.023094*0.004300, onCPU=false).
SetGaussianRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4 SetGaussianRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4
Node 'conv1_act.b' (LearnableParameter operation): Initializing Parameter[1 x 1 x 32] <- 0.000000.
Node 'conv2_act.W' (LearnableParameter operation): Initializing Parameter[32 x 800] <- gaussian(seed=2, range=0.007071*1.414000, onCPU=false).
Node 'conv2_act.b' (LearnableParameter operation): Initializing Parameter[1 x 1 x 32] <- 0.000000.
Node 'conv3_act.W' (LearnableParameter operation): Initializing Parameter[64 x 800] <- gaussian(seed=3, range=0.007071*1.414000, onCPU=false).
Node 'conv3_act.b' (LearnableParameter operation): Initializing Parameter[1 x 1 x 64] <- 0.000000.
Node 'h1.W' (LearnableParameter operation): Initializing Parameter[64 x 3 x 3 x 64] <- gaussian(seed=4, range=0.008333*12.000000, onCPU=false).
Node 'h1.b' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 0.000000.
Node 'OutputNodes.W' (LearnableParameter operation): Initializing Parameter[10 x 64] <- gaussian(seed=5, range=0.025000*1.500000, onCPU=false).
Node 'OutputNodes.b' (LearnableParameter operation): Initializing Parameter[10] <- 0.000000.
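Each gaussian(range=a*b, ...) line above factors the initialization scale into two terms; the logged numbers are consistent with the first factor being CNTK's default 0.2/sqrt(fanIn) and the second the layer's initValueScale from the NDL (an assumption here, since the NDL file itself is not shown). A quick check:

    import math

    # fan-in per output unit, read off the tensor shapes above (assumed mapping)
    fan_in = {"conv1_act.W": 75, "conv2_act.W": 800, "conv3_act.W": 800,
              "h1.W": 3 * 3 * 64, "OutputNodes.W": 64}
    for name, n in fan_in.items():
        print(f"{name}: 0.2/sqrt({n}) = {0.2 / math.sqrt(n):.6f}")
    # conv1_act.W: 0.023094, conv2_act.W/conv3_act.W: 0.007071,
    # h1.W: 0.008333, OutputNodes.W: 0.025000 -- the first factor in each line.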
Post-processing network... Post-processing network...
3 roots: 3 roots:
CE = CrossEntropyWithSoftmax() CE = CrossEntropyWithSoftmax()
Err = ClassificationError() Err = ErrorPrediction()
OutputNodes.z = Plus() OutputNodes.z = Plus()
Validating network. 34 nodes to process in pass 1. Validating network. 34 nodes to process in pass 1.
@ -326,7 +360,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, h1_d) : [10 x 64], [64 x 1
Validating --> OutputNodes.b = LearnableParameter() : -> [10] Validating --> OutputNodes.b = LearnableParameter() : -> [10]
Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x 1 x *], [10] -> [10 x 1 x *] Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x 1 x *], [10] -> [10 x 1 x *]
Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x 1 x *] -> [1] Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x 1 x *] -> [1]
Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *], [10 x 1 x *] -> [1] Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *], [10 x 1 x *] -> [1]
Validating network. 21 nodes to process in pass 2. Validating network. 21 nodes to process in pass 2.
@ -334,165 +368,183 @@ Validating network. 21 nodes to process in pass 2.
Validating network, final pass. Validating network, final pass.
Using cuDNN convolution engine for geometry: Input: 32 x 32 x 3, Output: 32 x 32 x 32, Kernel: 5 x 5 x 3, Map: 1 x 1 x 32, Stride: 1 x 1 x 3, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0. conv1_act.c: using cuDNN convolution engine for geometry: Input: 32 x 32 x 3, Output: 32 x 32 x 32, Kernel: 5 x 5 x 3, Map: 1 x 1 x 32, Stride: 1 x 1 x 3, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0.
Using cuDNN convolution engine for geometry: Input: 32 x 32 x 32, Output: 15 x 15 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0. pool1: using cuDNN convolution engine for geometry: Input: 32 x 32 x 32, Output: 15 x 15 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0.
Using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 15 x 15 x 32, Kernel: 5 x 5 x 32, Map: 1 x 1 x 32, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0. conv2_act.c: using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 15 x 15 x 32, Kernel: 5 x 5 x 32, Map: 1 x 1 x 32, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0.
Using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 7 x 7 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0. pool2: using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 7 x 7 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0.
Using cuDNN convolution engine for geometry: Input: 7 x 7 x 32, Output: 7 x 7 x 64, Kernel: 5 x 5 x 32, Map: 1 x 1 x 64, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0. conv3_act.c: using cuDNN convolution engine for geometry: Input: 7 x 7 x 32, Output: 7 x 7 x 64, Kernel: 5 x 5 x 32, Map: 1 x 1 x 64, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0.
Using cuDNN convolution engine for geometry: Input: 7 x 7 x 64, Output: 3 x 3 x 64, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0. pool3: using cuDNN convolution engine for geometry: Input: 7 x 7 x 64, Output: 3 x 3 x 64, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0.
13 out of 34 nodes do not share the minibatch layout with the input data. 13 out of 34 nodes do not share the minibatch layout with the input data.
Post-processing network complete. Post-processing network complete.
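Each "using cuDNN convolution engine for geometry" line above is plain shape arithmetic; a small helper (an illustrative sketch, not CNTK code) reproduces the 32 -> 15 -> 7 -> 3 pooling chain with kernel 3, stride 2, and no auto-padding:

    def out_dim(n: int, kernel: int, stride: int, pad: int = 0) -> int:
        """Output spatial size: floor((n + 2*pad - kernel) / stride) + 1."""
        return (n + 2 * pad - kernel) // stride + 1

    n = 32
    for name in ("pool1", "pool2", "pool3"):
        n = out_dim(n, kernel=3, stride=2)
        print(name, n)   # pool1 15, pool2 7, pool3 3 -- as in the log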
05/13/2016 15:10:48: Created model with 34 nodes on GPU 0. 08/16/2016 10:50:38: Created model with 34 nodes on GPU 0.
05/13/2016 15:10:48: Training criterion node(s): 08/16/2016 10:50:38: Training criterion node(s):
05/13/2016 15:10:48: CE = CrossEntropyWithSoftmax 08/16/2016 10:50:38: CE = CrossEntropyWithSoftmax
05/13/2016 15:10:48: Evaluation criterion node(s): 08/16/2016 10:50:38: Evaluation criterion node(s):
08/16/2016 10:50:38: Err = ErrorPrediction
05/13/2016 15:10:48: Err = ClassificationError
Allocating matrices for forward and/or backward propagation. Allocating matrices for forward and/or backward propagation.
Memory Sharing Structure: Memory Sharing: Out of 63 matrices, 38 are shared as 17, and 25 are not shared.
(nil): {[Err Gradient[1]] [featOffs Gradient[1 x 1]] [featScaled Gradient[32 x 32 x 3 x *]] [features Gradient[32 x 32 x 3 x *]] [labels Gradient[10 x *]] } { conv1_act.W : [32 x 75] (gradient)
0x2485d28: {[OutputNodes.z Value[10 x 1 x *]] } conv1_act.p : [32 x 32 x 32 x *] }
0x2485ee8: {[CE Value[1]] } { conv1_act.c : [32 x 32 x 32 x *] (gradient)
0x2486168: {[conv1_act.W Gradient[32 x 75]] [conv1_act.p Value[32 x 32 x 32 x *]] } conv1_act.y : [32 x 32 x 32 x *] }
0x2486328: {[conv1_act.c Gradient[32 x 32 x 32 x *]] [conv1_act.y Value[32 x 32 x 32 x *]] } { conv1_act.p : [32 x 32 x 32 x *] (gradient)
0x24864e8: {[conv1_act.p Gradient[32 x 32 x 32 x *]] [pool1 Value[15 x 15 x 32 x *]] } pool1 : [15 x 15 x 32 x *] }
0x249a638: {[features Value[32 x 32 x 3 x *]] } { conv1_act.b : [1 x 1 x 32] (gradient)
0x2975298: {[conv1_act.b Value[1 x 1 x 32]] } conv1_act.y : [32 x 32 x 32 x *] (gradient) }
0x2976b48: {[conv2_act.W Value[32 x 800]] } { conv2_act.W : [32 x 800] (gradient)
0x2977ae8: {[conv2_act.b Value[1 x 1 x 32]] } conv2_act.p : [15 x 15 x 32 x *] }
0x2979668: {[conv3_act.W Value[64 x 800]] } { conv2_act.c : [15 x 15 x 32 x *] (gradient)
0x2979f08: {[conv3_act.b Value[1 x 1 x 64]] } conv2_act.y : [15 x 15 x 32 x *] }
0x297bae8: {[h1.W Value[64 x 3 x 3 x 64]] } { conv2_act.p : [15 x 15 x 32 x *] (gradient)
0x297c538: {[h1.b Value[64 x 1]] } pool1 : [15 x 15 x 32 x *] (gradient)
0x297d5c8: {[OutputNodes.W Value[10 x 64]] } pool2 : [7 x 7 x 32 x *] }
0x297ea98: {[OutputNodes.b Value[10]] } { conv2_act.b : [1 x 1 x 32] (gradient)
0x2dd1458: {[featOffs Value[1 x 1]] } conv2_act.y : [15 x 15 x 32 x *] (gradient) }
0x2dd2678: {[labels Value[10 x *]] } { conv3_act.W : [64 x 800] (gradient)
0x2dd2eb8: {[conv1_act.W Value[32 x 75]] } conv3_act.p : [7 x 7 x 64 x *] }
0x7a59dd8: {[Err Value[1]] } { conv3_act.c : [7 x 7 x 64 x *] (gradient)
0x7a5d378: {[featScaled Value[32 x 32 x 3 x *]] } conv3_act.y : [7 x 7 x 64 x *] }
0x7a5d6d8: {[conv1_act.c Value[32 x 32 x 32 x *]] } { conv3_act.p : [7 x 7 x 64 x *] (gradient)
0x7a5e478: {[conv2_act.c Value[15 x 15 x 32 x *]] } pool2 : [7 x 7 x 32 x *] (gradient)
0x7a5e638: {[conv1_act.b Gradient[1 x 1 x 32]] [conv1_act.y Gradient[32 x 32 x 32 x *]] } pool3 : [3 x 3 x 64 x *] }
0x7a5e7f8: {[conv2_act.W Gradient[32 x 800]] [conv2_act.p Value[15 x 15 x 32 x *]] } { conv3_act.b : [1 x 1 x 64] (gradient)
0x7a7ade8: {[conv2_act.c Gradient[15 x 15 x 32 x *]] [conv2_act.y Value[15 x 15 x 32 x *]] } conv3_act.y : [7 x 7 x 64 x *] (gradient)
0x7a7afa8: {[conv2_act.p Gradient[15 x 15 x 32 x *]] [pool1 Gradient[15 x 15 x 32 x *]] [pool2 Value[7 x 7 x 32 x *]] } h1.t : [64 x *] }
0x7a7b168: {[conv3_act.c Value[7 x 7 x 64 x *]] } { h1.W : [64 x 3 x 3 x 64] (gradient)
0x7a7b328: {[conv2_act.b Gradient[1 x 1 x 32]] [conv2_act.y Gradient[15 x 15 x 32 x *]] } h1.z : [64 x 1 x *] }
0x7a7b4e8: {[conv3_act.W Gradient[64 x 800]] [conv3_act.p Value[7 x 7 x 64 x *]] } { h1.t : [64 x *] (gradient)
0x7a7b6a8: {[conv3_act.c Gradient[7 x 7 x 64 x *]] [conv3_act.y Value[7 x 7 x 64 x *]] } h1.y : [64 x 1 x *] }
0x7a7b868: {[conv3_act.p Gradient[7 x 7 x 64 x *]] [pool2 Gradient[7 x 7 x 32 x *]] [pool3 Value[3 x 3 x 64 x *]] } { h1.z : [64 x 1 x *] (gradient)
0x7a7ba28: {[conv3_act.b Gradient[1 x 1 x 64]] [conv3_act.y Gradient[7 x 7 x 64 x *]] [h1.t Value[64 x *]] } pool3 : [3 x 3 x 64 x *] (gradient) }
0x7a7bbe8: {[h1.W Gradient[64 x 3 x 3 x 64]] [h1.z Value[64 x 1 x *]] } { OutputNodes.t : [10 x 1 x *]
0x7a7bda8: {[h1.t Gradient[64 x *]] [h1.y Value[64 x 1 x *]] } h1.b : [64 x 1] (gradient)
0x7a7bf68: {[h1_d Value[64 x 1 x *]] } h1.y : [64 x 1 x *] (gradient) }
0x7a7c128: {[h1.z Gradient[64 x 1 x *]] [pool3 Gradient[3 x 3 x 64 x *]] } { OutputNodes.W : [10 x 64] (gradient)
0x7a7c2e8: {[OutputNodes.t Value[10 x 1 x *]] [h1.b Gradient[64 x 1]] [h1.y Gradient[64 x 1 x *]] } OutputNodes.z : [10 x 1 x *] (gradient) }
0x7a7cdc8: {[CE Gradient[1]] }
0x7a7cf88: {[OutputNodes.W Gradient[10 x 64]] [OutputNodes.z Gradient[10 x 1 x *]] }
0x7a7d148: {[OutputNodes.t Gradient[10 x 1 x *]] }
0x7a7d308: {[OutputNodes.b Gradient[10]] }
0x7a7d4c8: {[h1_d Gradient[64 x 1 x *]] }
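"38 are shared as 17" above means 38 of the 63 matrices alias only 17 underlying buffers: whenever two matrices' live ranges do not overlap (for example a weight gradient and a forward value needed only at a different phase), they can reuse the same storage. A conceptual sketch of such lifetime-based sharing (not CNTK's actual allocator):

    def share(live_ranges: dict) -> list:
        """Greedily pack names with non-overlapping (start, end) ranges into buffers."""
        pools = []                          # each pool: [last_end, member_names]
        for name, (start, end) in sorted(live_ranges.items(), key=lambda kv: kv[1]):
            for pool in pools:
                if pool[0] <= start:        # buffer is free again before this range
                    pool[0] = end
                    pool[1].append(name)
                    break
            else:
                pools.append([end, [name]])
        return [members for _, members in pools]

    # hypothetical step indices, mirroring one shared pair from the log above
    print(share({"conv1_act.W (gradient)": (5, 6), "conv1_act.p (value)": (7, 9)}))
    # [['conv1_act.W (gradient)', 'conv1_act.p (value)']] -- one shared buffer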
05/13/2016 15:10:48: No PreCompute nodes found, skipping PreCompute step.
05/13/2016 15:10:48: Starting Epoch 1: learning rate per sample = 0.000156 effective momentum = 0.900000 momentum as time constant = 607.4 samples 08/16/2016 10:50:38: Training 116906 parameters in 10 out of 10 parameter tensors and 29 nodes with gradient:
05/13/2016 15:10:48: Starting minibatch loop. 08/16/2016 10:50:38: Node 'OutputNodes.W' (LearnableParameter operation) : [10 x 64]
05/13/2016 15:10:51: Finished Epoch[ 1 of 10]: [Training] CE = 2.30242050 * 100; Err = 0.88000000 * 100; totalSamplesSeen = 100; learningRatePerSample = 0.00015625; epochTime=3.55904s 08/16/2016 10:50:38: Node 'OutputNodes.b' (LearnableParameter operation) : [10]
05/13/2016 15:10:51: SGD: Saving checkpoint model '/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.1' 08/16/2016 10:50:38: Node 'conv1_act.W' (LearnableParameter operation) : [32 x 75]
08/16/2016 10:50:38: Node 'conv1_act.b' (LearnableParameter operation) : [1 x 1 x 32]
08/16/2016 10:50:38: Node 'conv2_act.W' (LearnableParameter operation) : [32 x 800]
08/16/2016 10:50:38: Node 'conv2_act.b' (LearnableParameter operation) : [1 x 1 x 32]
08/16/2016 10:50:38: Node 'conv3_act.W' (LearnableParameter operation) : [64 x 800]
08/16/2016 10:50:38: Node 'conv3_act.b' (LearnableParameter operation) : [1 x 1 x 64]
08/16/2016 10:50:38: Node 'h1.W' (LearnableParameter operation) : [64 x 3 x 3 x 64]
08/16/2016 10:50:38: Node 'h1.b' (LearnableParameter operation) : [64 x 1]
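The "Training 116906 parameters in 10 out of 10 parameter tensors" figure is simply the sum of the tensor shapes listed above:

    from math import prod

    shapes = {
        "OutputNodes.W": (10, 64), "OutputNodes.b": (10,),
        "conv1_act.W": (32, 75),   "conv1_act.b": (1, 1, 32),
        "conv2_act.W": (32, 800),  "conv2_act.b": (1, 1, 32),
        "conv3_act.W": (64, 800),  "conv3_act.b": (1, 1, 64),
        "h1.W": (64, 3, 3, 64),    "h1.b": (64, 1),
    }
    print(sum(prod(s) for s in shapes.values()))   # 116906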
05/13/2016 15:10:51: Starting Epoch 2: learning rate per sample = 0.000156 effective momentum = 0.900000 momentum as time constant = 607.4 samples 08/16/2016 10:50:38: No PreCompute nodes found, or all already computed. Skipping pre-computation step.
05/13/2016 15:10:51: Starting minibatch loop. 08/16/2016 10:50:38: Starting Epoch 1: learning rate per sample = 0.000156 effective momentum = 0.900000 momentum as time constant = 607.4 samples
05/13/2016 15:10:51: Finished Epoch[ 2 of 10]: [Training] CE = 2.30175842 * 100; Err = 0.94000000 * 100; totalSamplesSeen = 200; learningRatePerSample = 0.00015625; epochTime=0.011903s BlockRandomizer::StartEpoch: epoch 0: frames [0..100] (first sequence at sample 0), data subset 0 of 1
05/13/2016 15:10:51: SGD: Saving checkpoint model '/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.2'
05/13/2016 15:10:51: Starting Epoch 3: learning rate per sample = 0.000156 effective momentum = 0.900000 momentum as time constant = 607.4 samples 08/16/2016 10:50:38: Starting minibatch loop.
08/16/2016 10:50:41: Finished Epoch[ 1 of 10]: [Training] CE = 2.30223602 * 100; Err = 0.90000000 * 100; totalSamplesSeen = 100; learningRatePerSample = 0.00015625; epochTime=3.51082s
08/16/2016 10:50:41: SGD: Saving checkpoint model '/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.1'
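The per-epoch header's "momentum as time constant = 607.4 samples" follows from the effective per-minibatch momentum of 0.9. Assuming a training minibatch size of 64 (the SGD block is truncated above, but learningRatePerSample = 0.00015625 = 0.01/64 points the same way), the time constant is minibatchSize / -ln(momentum):

    import math

    momentum = 0.9        # effective momentum per minibatch, from the log
    minibatch_size = 64   # assumption: not visible in the truncated SGD section
    print(minibatch_size / -math.log(momentum))   # 607.44... samples
    print(0.01 / minibatch_size)                  # 0.00015625 per-sample rate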
05/13/2016 15:10:51: Starting minibatch loop. 08/16/2016 10:50:41: Starting Epoch 2: learning rate per sample = 0.000156 effective momentum = 0.900000 momentum as time constant = 607.4 samples
05/13/2016 15:10:51: Finished Epoch[ 3 of 10]: [Training] CE = 2.30054413 * 100; Err = 0.90000000 * 100; totalSamplesSeen = 300; learningRatePerSample = 0.00015625; epochTime=0.012701s BlockRandomizer::StartEpoch: epoch 1: frames [100..200] (first sequence at sample 100), data subset 0 of 1
05/13/2016 15:10:51: SGD: Saving checkpoint model '/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.3'
05/13/2016 15:10:51: Starting Epoch 4: learning rate per sample = 0.000156 effective momentum = 0.900000 momentum as time constant = 607.4 samples 08/16/2016 10:50:41: Starting minibatch loop.
08/16/2016 10:50:41: Finished Epoch[ 2 of 10]: [Training] CE = 2.30189240 * 100; Err = 0.87000000 * 100; totalSamplesSeen = 200; learningRatePerSample = 0.00015625; epochTime=0.012555s
08/16/2016 10:50:41: SGD: Saving checkpoint model '/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.2'
05/13/2016 15:10:51: Starting minibatch loop. 08/16/2016 10:50:41: Starting Epoch 3: learning rate per sample = 0.000156 effective momentum = 0.900000 momentum as time constant = 607.4 samples
05/13/2016 15:10:51: Finished Epoch[ 4 of 10]: [Training] CE = 2.30022812 * 100; Err = 0.88000000 * 100; totalSamplesSeen = 400; learningRatePerSample = 0.00015625; epochTime=0.01144s BlockRandomizer::StartEpoch: epoch 2: frames [200..300] (first sequence at sample 200), data subset 0 of 1
05/13/2016 15:10:51: SGD: Saving checkpoint model '/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.4'
05/13/2016 15:10:51: Starting Epoch 5: learning rate per sample = 0.000156 effective momentum = 0.900000 momentum as time constant = 607.4 samples 08/16/2016 10:50:41: Starting minibatch loop.
08/16/2016 10:50:41: Finished Epoch[ 3 of 10]: [Training] CE = 2.29965256 * 100; Err = 0.86000000 * 100; totalSamplesSeen = 300; learningRatePerSample = 0.00015625; epochTime=0.012394s
08/16/2016 10:50:41: SGD: Saving checkpoint model '/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.3'
05/13/2016 15:10:51: Starting minibatch loop. 08/16/2016 10:50:41: Starting Epoch 4: learning rate per sample = 0.000156 effective momentum = 0.900000 momentum as time constant = 607.4 samples
05/13/2016 15:10:51: Finished Epoch[ 5 of 10]: [Training] CE = 2.29579636 * 100; Err = 0.87000000 * 100; totalSamplesSeen = 500; learningRatePerSample = 0.00015625; epochTime=0.011529s BlockRandomizer::StartEpoch: epoch 3: frames [300..400] (first sequence at sample 300), data subset 0 of 1
05/13/2016 15:10:51: SGD: Saving checkpoint model '/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.5'
08/16/2016 10:50:41: Starting minibatch loop.
08/16/2016 10:50:41: Finished Epoch[ 4 of 10]: [Training] CE = 2.29966064 * 100; Err = 0.91000000 * 100; totalSamplesSeen = 400; learningRatePerSample = 0.00015625; epochTime=0.0124s
08/16/2016 10:50:41: SGD: Saving checkpoint model '/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.4'
08/16/2016 10:50:41: Starting Epoch 5: learning rate per sample = 0.000156 effective momentum = 0.900000 momentum as time constant = 607.4 samples
BlockRandomizer::StartEpoch: epoch 4: frames [400..500] (first sequence at sample 400), data subset 0 of 1
08/16/2016 10:50:41: Starting minibatch loop.
08/16/2016 10:50:41: Finished Epoch[ 5 of 10]: [Training] CE = 2.30450394 * 100; Err = 0.94000000 * 100; totalSamplesSeen = 500; learningRatePerSample = 0.00015625; epochTime=0.012302s
08/16/2016 10:50:41: SGD: Saving checkpoint model '/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.5'
Setting dropout rate to 0.5. Setting dropout rate to 0.5.
05/13/2016 15:10:51: Starting Epoch 6: learning rate per sample = 0.000156 effective momentum = 0.900000 momentum as time constant = 607.4 samples 08/16/2016 10:50:41: Starting Epoch 6: learning rate per sample = 0.000156 effective momentum = 0.900000 momentum as time constant = 607.4 samples
BlockRandomizer::StartEpoch: epoch 5: frames [500..600] (first sequence at sample 500), data subset 0 of 1
05/13/2016 15:10:51: Starting minibatch loop. 08/16/2016 10:50:41: Starting minibatch loop.
(GPU): creating curand object with seed 5 (GPU): creating curand object with seed 5
05/13/2016 15:10:51: Finished Epoch[ 6 of 10]: [Training] CE = 2.30121231 * 100; Err = 0.84000000 * 100; totalSamplesSeen = 600; learningRatePerSample = 0.00015625; epochTime=0.012276s 08/16/2016 10:50:41: Finished Epoch[ 6 of 10]: [Training] CE = 2.29013916 * 100; Err = 0.81000000 * 100; totalSamplesSeen = 600; learningRatePerSample = 0.00015625; epochTime=0.012412s
05/13/2016 15:10:51: SGD: Saving checkpoint model '/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.6' 08/16/2016 10:50:41: SGD: Saving checkpoint model '/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.6'
05/13/2016 15:10:51: Starting Epoch 7: learning rate per sample = 0.000156 effective momentum = 0.900000 momentum as time constant = 607.4 samples 08/16/2016 10:50:41: Starting Epoch 7: learning rate per sample = 0.000156 effective momentum = 0.900000 momentum as time constant = 607.4 samples
BlockRandomizer::StartEpoch: epoch 6: frames [600..700] (first sequence at sample 600), data subset 0 of 1
05/13/2016 15:10:51: Starting minibatch loop. 08/16/2016 10:50:41: Starting minibatch loop.
(GPU): creating curand object with seed 6 (GPU): creating curand object with seed 6
05/13/2016 15:10:52: Finished Epoch[ 7 of 10]: [Training] CE = 2.28975647 * 100; Err = 0.93000000 * 100; totalSamplesSeen = 700; learningRatePerSample = 0.00015625; epochTime=0.011495s 08/16/2016 10:50:41: Finished Epoch[ 7 of 10]: [Training] CE = 2.29815765 * 100; Err = 0.93000000 * 100; totalSamplesSeen = 700; learningRatePerSample = 0.00015625; epochTime=0.012303s
05/13/2016 15:10:52: SGD: Saving checkpoint model '/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.7' 08/16/2016 10:50:41: SGD: Saving checkpoint model '/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.7'
05/13/2016 15:10:52: Starting Epoch 8: learning rate per sample = 0.000156 effective momentum = 0.900000 momentum as time constant = 607.4 samples 08/16/2016 10:50:41: Starting Epoch 8: learning rate per sample = 0.000156 effective momentum = 0.900000 momentum as time constant = 607.4 samples
BlockRandomizer::StartEpoch: epoch 7: frames [700..800] (first sequence at sample 700), data subset 0 of 1
05/13/2016 15:10:52: Starting minibatch loop. 08/16/2016 10:50:41: Starting minibatch loop.
(GPU): creating curand object with seed 7 (GPU): creating curand object with seed 7
05/13/2016 15:10:52: Finished Epoch[ 8 of 10]: [Training] CE = 2.29035095 * 100; Err = 0.91000000 * 100; totalSamplesSeen = 800; learningRatePerSample = 0.00015625; epochTime=0.012157s 08/16/2016 10:50:41: Finished Epoch[ 8 of 10]: [Training] CE = 2.28805603 * 100; Err = 0.89000000 * 100; totalSamplesSeen = 800; learningRatePerSample = 0.00015625; epochTime=0.012517s
05/13/2016 15:10:52: SGD: Saving checkpoint model '/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.8' 08/16/2016 10:50:41: SGD: Saving checkpoint model '/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.8'
05/13/2016 15:10:52: Starting Epoch 9: learning rate per sample = 0.000156 effective momentum = 0.900000 momentum as time constant = 607.4 samples 08/16/2016 10:50:41: Starting Epoch 9: learning rate per sample = 0.000156 effective momentum = 0.900000 momentum as time constant = 607.4 samples
BlockRandomizer::StartEpoch: epoch 8: frames [800..900] (first sequence at sample 800), data subset 0 of 1
05/13/2016 15:10:52: Starting minibatch loop. 08/16/2016 10:50:41: Starting minibatch loop.
(GPU): creating curand object with seed 8 (GPU): creating curand object with seed 8
05/13/2016 15:10:52: Finished Epoch[ 9 of 10]: [Training] CE = 2.29797729 * 100; Err = 0.87000000 * 100; totalSamplesSeen = 900; learningRatePerSample = 0.00015625; epochTime=0.011451s 08/16/2016 10:50:41: Finished Epoch[ 9 of 10]: [Training] CE = 2.29380524 * 100; Err = 0.88000000 * 100; totalSamplesSeen = 900; learningRatePerSample = 0.00015625; epochTime=0.012463s
05/13/2016 15:10:52: SGD: Saving checkpoint model '/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.9' 08/16/2016 10:50:41: SGD: Saving checkpoint model '/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution.9'
05/13/2016 15:10:52: Starting Epoch 10: learning rate per sample = 0.000156 effective momentum = 0.900000 momentum as time constant = 607.4 samples 08/16/2016 10:50:41: Starting Epoch 10: learning rate per sample = 0.000156 effective momentum = 0.900000 momentum as time constant = 607.4 samples
BlockRandomizer::StartEpoch: epoch 9: frames [900..1000] (first sequence at sample 900), data subset 0 of 1
05/13/2016 15:10:52: Starting minibatch loop. 08/16/2016 10:50:41: Starting minibatch loop.
(GPU): creating curand object with seed 9 (GPU): creating curand object with seed 9
05/13/2016 15:10:52: Finished Epoch[10 of 10]: [Training] CE = 2.29764435 * 100; Err = 0.87000000 * 100; totalSamplesSeen = 1000; learningRatePerSample = 0.00015625; epochTime=0.012689s 08/16/2016 10:50:41: Finished Epoch[10 of 10]: [Training] CE = 2.27814423 * 100; Err = 0.87000000 * 100; totalSamplesSeen = 1000; learningRatePerSample = 0.00015625; epochTime=0.012432s
05/13/2016 15:10:52: SGD: Saving checkpoint model '/tmp/cntk-test-20160513145544.775982/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution' 08/16/2016 10:50:41: SGD: Saving checkpoint model '/tmp/cntk-test-20160816095502.258817/Examples/Image/Miscellaneous/CIFAR-10_01_Convolution@release_gpu/Models/01_Convolution'
05/13/2016 15:10:52: CNTKCommandTrainEnd: Train 08/16/2016 10:50:41: CNTKCommandTrainEnd: Train
05/13/2016 15:10:52: Action "train" complete. 08/16/2016 10:50:41: Action "train" complete.
05/13/2016 15:10:52: ############################################################################## 08/16/2016 10:50:41: ##############################################################################
05/13/2016 15:10:52: # # 08/16/2016 10:50:41: # #
05/13/2016 15:10:52: # Action "test" # 08/16/2016 10:50:41: # Action "test" #
05/13/2016 15:10:52: # # 08/16/2016 10:50:41: # #
05/13/2016 15:10:52: ############################################################################## 08/16/2016 10:50:41: ##############################################################################
Post-processing network... Post-processing network...
3 roots: 3 roots:
CE = CrossEntropyWithSoftmax() CE = CrossEntropyWithSoftmax()
Err = ClassificationError() Err = ErrorPrediction()
OutputNodes.z = Plus() OutputNodes.z = Plus()
Validating network. 34 nodes to process in pass 1. Validating network. 34 nodes to process in pass 1.
@ -530,7 +582,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, h1_d) : [10 x 64], [64 x 1
Validating --> OutputNodes.b = LearnableParameter() : -> [10] Validating --> OutputNodes.b = LearnableParameter() : -> [10]
Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x 1 x *1], [10] -> [10 x 1 x *1] Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x 1 x *1], [10] -> [10 x 1 x *1]
Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x 1 x *1] -> [1] Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x 1 x *1] -> [1]
Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *1], [10 x 1 x *1] -> [1] Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *1], [10 x 1 x *1] -> [1]
Validating network. 21 nodes to process in pass 2. Validating network. 21 nodes to process in pass 2.
@ -538,17 +590,17 @@ Validating network. 21 nodes to process in pass 2.
Validating network, final pass. Validating network, final pass.
Using cuDNN convolution engine for geometry: Input: 32 x 32 x 3, Output: 32 x 32 x 32, Kernel: 5 x 5 x 3, Map: 1 x 1 x 32, Stride: 1 x 1 x 3, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0. conv1_act.c: using cuDNN convolution engine for geometry: Input: 32 x 32 x 3, Output: 32 x 32 x 32, Kernel: 5 x 5 x 3, Map: 1 x 1 x 32, Stride: 1 x 1 x 3, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0.
Using cuDNN convolution engine for geometry: Input: 32 x 32 x 32, Output: 15 x 15 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0. pool1: using cuDNN convolution engine for geometry: Input: 32 x 32 x 32, Output: 15 x 15 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0.
Using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 15 x 15 x 32, Kernel: 5 x 5 x 32, Map: 1 x 1 x 32, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0. conv2_act.c: using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 15 x 15 x 32, Kernel: 5 x 5 x 32, Map: 1 x 1 x 32, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0.
Using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 7 x 7 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0. pool2: using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 7 x 7 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0.
Using cuDNN convolution engine for geometry: Input: 7 x 7 x 32, Output: 7 x 7 x 64, Kernel: 5 x 5 x 32, Map: 1 x 1 x 64, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0. conv3_act.c: using cuDNN convolution engine for geometry: Input: 7 x 7 x 32, Output: 7 x 7 x 64, Kernel: 5 x 5 x 32, Map: 1 x 1 x 64, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0.
Using cuDNN convolution engine for geometry: Input: 7 x 7 x 64, Output: 3 x 3 x 64, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0. pool3: using cuDNN convolution engine for geometry: Input: 7 x 7 x 64, Output: 3 x 3 x 64, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0.
13 out of 34 nodes do not share the minibatch layout with the input data. 13 out of 34 nodes do not share the minibatch layout with the input data.
@ -560,46 +612,14 @@ evalNodeNames are not specified, using all the default evalnodes and training cr
Allocating matrices for forward and/or backward propagation. Allocating matrices for forward and/or backward propagation.
Memory Sharing Structure: Memory Sharing: Out of 34 matrices, 0 are shared as 0, and 34 are not shared.
(nil): {[CE Gradient[1]] [Err Gradient[1]] [OutputNodes.W Gradient[10 x 64]] [OutputNodes.b Gradient[10]] [OutputNodes.t Gradient[10 x 1 x *1]] [OutputNodes.z Gradient[10 x 1 x *1]] [conv1_act.W Gradient[32 x 75]] [conv1_act.b Gradient[1 x 1 x 32]] [conv1_act.c Gradient[32 x 32 x 32 x *1]] [conv1_act.p Gradient[32 x 32 x 32 x *1]] [conv1_act.y Gradient[32 x 32 x 32 x *1]] [conv2_act.W Gradient[32 x 800]] [conv2_act.b Gradient[1 x 1 x 32]] [conv2_act.c Gradient[15 x 15 x 32 x *1]] [conv2_act.p Gradient[15 x 15 x 32 x *1]] [conv2_act.y Gradient[15 x 15 x 32 x *1]] [conv3_act.W Gradient[64 x 800]] [conv3_act.b Gradient[1 x 1 x 64]] [conv3_act.c Gradient[7 x 7 x 64 x *1]] [conv3_act.p Gradient[7 x 7 x 64 x *1]] [conv3_act.y Gradient[7 x 7 x 64 x *1]] [featOffs Gradient[1 x 1]] [featScaled Gradient[32 x 32 x 3 x *1]] [features Gradient[32 x 32 x 3 x *1]] [h1.W Gradient[64 x 3 x 3 x 64]] [h1.b Gradient[64 x 1]] [h1.t Gradient[64 x *1]] [h1.y Gradient[64 x 1 x *1]] [h1.z Gradient[64 x 1 x *1]] [h1_d Gradient[64 x 1 x *1]] [labels Gradient[10 x *1]] [pool1 Gradient[15 x 15 x 32 x *1]] [pool2 Gradient[7 x 7 x 32 x *1]] [pool3 Gradient[3 x 3 x 64 x *1]] }
0x7fc883e04ba8: {[conv1_act.b Value[1 x 1 x 32]] }
0x7fc883e05fc8: {[conv1_act.W Value[32 x 75]] }
0x7fc883e06768: {[conv2_act.b Value[1 x 1 x 32]] }
0x7fc883e06928: {[conv2_act.W Value[32 x 800]] }
0x7fc883e085b8: {[conv3_act.b Value[1 x 1 x 64]] }
0x7fc883e09528: {[conv3_act.W Value[64 x 800]] }
0x7fc883e0b568: {[featOffs Value[1 x 1]] }
0x7fc883e0c1e8: {[features Value[32 x 32 x 3 x *1]] }
0x7fc883e0cc38: {[h1.b Value[64 x 1]] }
0x7fc883e0cf08: {[h1.W Value[64 x 3 x 3 x 64]] }
0x7fc883e0eb48: {[labels Value[10 x *1]] }
0x7fc883e0f558: {[OutputNodes.b Value[10]] }
0x7fc883e10068: {[OutputNodes.W Value[10 x 64]] }
0x7fc883e286b8: {[Err Value[1]] }
0x7fc883e2bd28: {[CE Value[1]] }
0x7fc883e2bfa8: {[conv1_act.y Value[32 x 32 x 32 x *1]] }
0x7fc883e54728: {[conv1_act.c Value[32 x 32 x 32 x *1]] }
0x7fc883e54a88: {[featScaled Value[32 x 32 x 3 x *1]] }
0x7fc883e54c18: {[conv1_act.p Value[32 x 32 x 32 x *1]] }
0x7fc883e71a78: {[pool1 Value[15 x 15 x 32 x *1]] }
0x7fc883e71c38: {[conv2_act.c Value[15 x 15 x 32 x *1]] }
0x7fc883e71fb8: {[conv2_act.p Value[15 x 15 x 32 x *1]] }
0x7fc883e72178: {[conv2_act.y Value[15 x 15 x 32 x *1]] }
0x7fc883e72338: {[pool2 Value[7 x 7 x 32 x *1]] }
0x7fc883e724f8: {[conv3_act.c Value[7 x 7 x 64 x *1]] }
0x7fc883e72878: {[conv3_act.p Value[7 x 7 x 64 x *1]] }
0x7fc883e72a38: {[conv3_act.y Value[7 x 7 x 64 x *1]] }
0x7fc883e72bf8: {[pool3 Value[3 x 3 x 64 x *1]] }
0x7fc883e72db8: {[h1.t Value[64 x *1]] }
0x7fc883e72f78: {[h1.z Value[64 x 1 x *1]] }
0x7fc883e73138: {[h1.y Value[64 x 1 x *1]] }
0x7fc883e732f8: {[h1_d Value[64 x 1 x *1]] }
0x7fc883e73678: {[OutputNodes.t Value[10 x 1 x *1]] }
0x7fc883e73838: {[OutputNodes.z Value[10 x 1 x *1]] }
05/13/2016 15:10:58: Final Results: Minibatch[1-625]: Err = 0.86430000 * 10000; CE = 2.28476029 * 10000; perplexity = 9.82333117 BlockRandomizer::StartEpoch: epoch 0: frames [0..10000] (first sequence at sample 0), data subset 0 of 1
08/16/2016 10:50:43: Minibatch[1-500]: Err = 0.86125000 * 8000; CE = 2.28389484 * 8000
08/16/2016 10:50:43: Minibatch[501-625]: Err = 0.86350000 * 2000; CE = 2.28027481 * 2000
08/16/2016 10:50:43: Final Results: Minibatch[1-625]: Err = 0.86170000 * 10000; CE = 2.28317084 * 10000; perplexity = 9.80772986
05/13/2016 15:10:58: Action "test" complete. 08/16/2016 10:50:43: Action "test" complete.
05/13/2016 15:10:58: __COMPLETED__ 08/16/2016 10:50:43: __COMPLETED__
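The final test numbers are the sample-weighted average of the two logged minibatch ranges (8000 and 2000 samples), and perplexity is exp(CE):

    import math

    err = (0.86125000 * 8000 + 0.86350000 * 2000) / 10000
    ce  = (2.28389484 * 8000 + 2.28027481 * 2000) / 10000
    print(err)            # 0.8617      -> Err = 0.86170000 * 10000
    print(ce)             # 2.283170... -> CE  = 2.28317084 * 10000
    print(math.exp(ce))   # 9.8077...   -> perplexity = 9.80772986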
View file
@ -286,7 +286,7 @@ Post-processing network...
3 roots: 3 roots:
CE = CrossEntropyWithSoftmax() CE = CrossEntropyWithSoftmax()
Err = ClassificationError() Err = ErrorPrediction()
OutputNodes.z = Plus() OutputNodes.z = Plus()
Validating network. 34 nodes to process in pass 1. Validating network. 34 nodes to process in pass 1.
@ -324,7 +324,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, h1_d) : [10 x 64], [64 x 1
Validating --> OutputNodes.b = LearnableParameter() : -> [10] Validating --> OutputNodes.b = LearnableParameter() : -> [10]
Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x 1 x *], [10] -> [10 x 1 x *] Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x 1 x *], [10] -> [10 x 1 x *]
Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x 1 x *] -> [1] Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x 1 x *] -> [1]
Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *], [10 x 1 x *] -> [1] Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *], [10 x 1 x *] -> [1]
Validating network. 21 nodes to process in pass 2. Validating network. 21 nodes to process in pass 2.
@ -356,7 +356,7 @@ Post-processing network complete.
05/13/2016 08:17:53: Evaluation criterion node(s): 05/13/2016 08:17:53: Evaluation criterion node(s):
05/13/2016 08:17:53: Err = ClassificationError 05/13/2016 08:17:53: Err = ErrorPrediction
Allocating matrices for forward and/or backward propagation. Allocating matrices for forward and/or backward propagation.
@ -490,7 +490,7 @@ Post-processing network...
3 roots: 3 roots:
CE = CrossEntropyWithSoftmax() CE = CrossEntropyWithSoftmax()
Err = ClassificationError() Err = ErrorPrediction()
OutputNodes.z = Plus() OutputNodes.z = Plus()
Validating network. 34 nodes to process in pass 1. Validating network. 34 nodes to process in pass 1.
@ -528,7 +528,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, h1_d) : [10 x 64], [64 x 1
Validating --> OutputNodes.b = LearnableParameter() : -> [10] Validating --> OutputNodes.b = LearnableParameter() : -> [10]
Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x 1 x *1], [10] -> [10 x 1 x *1] Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x 1 x *1], [10] -> [10 x 1 x *1]
Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x 1 x *1] -> [1] Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x 1 x *1] -> [1]
Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *1], [10 x 1 x *1] -> [1] Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *1], [10 x 1 x *1] -> [1]
Validating network. 21 nodes to process in pass 2. Validating network. 21 nodes to process in pass 2.
View file
@ -291,7 +291,7 @@ Post-processing network...
3 roots: 3 roots:
CE = CrossEntropyWithSoftmax() CE = CrossEntropyWithSoftmax()
Err = ClassificationError() Err = ErrorPrediction()
OutputNodes.z = Plus() OutputNodes.z = Plus()
Validating network. 45 nodes to process in pass 1. Validating network. 45 nodes to process in pass 1.
@ -340,7 +340,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, h1.y) : [10 x 64], [64 x *]
Validating --> OutputNodes.b = LearnableParameter() : -> [10] Validating --> OutputNodes.b = LearnableParameter() : -> [10]
Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *], [10] -> [10 x *] Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *], [10] -> [10 x *]
Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1] Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1] Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
Validating network. 20 nodes to process in pass 2. Validating network. 20 nodes to process in pass 2.
@ -380,7 +380,7 @@ Post-processing network complete.
05/13/2016 15:10:59: Evaluation criterion node(s): 05/13/2016 15:10:59: Evaluation criterion node(s):
05/13/2016 15:10:59: Err = ClassificationError 05/13/2016 15:10:59: Err = ErrorPrediction
Allocating matrices for forward and/or backward propagation. Allocating matrices for forward and/or backward propagation.
@ -491,7 +491,7 @@ Post-processing network...
3 roots: 3 roots:
CE = CrossEntropyWithSoftmax() CE = CrossEntropyWithSoftmax()
Err = ClassificationError() Err = ErrorPrediction()
OutputNodes.z = Plus() OutputNodes.z = Plus()
Validating network. 45 nodes to process in pass 1. Validating network. 45 nodes to process in pass 1.
@ -540,7 +540,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, h1.y) : [10 x 64], [64 x *1
Validating --> OutputNodes.b = LearnableParameter() : -> [10] Validating --> OutputNodes.b = LearnableParameter() : -> [10]
Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *1], [10] -> [10 x *1] Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *1], [10] -> [10 x *1]
Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1] Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1] Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
Validating network. 20 nodes to process in pass 2. Validating network. 20 nodes to process in pass 2.
View file
@ -289,7 +289,7 @@ Post-processing network...
3 roots: 3 roots:
CE = CrossEntropyWithSoftmax() CE = CrossEntropyWithSoftmax()
Err = ClassificationError() Err = ErrorPrediction()
OutputNodes.z = Plus() OutputNodes.z = Plus()
Validating network. 45 nodes to process in pass 1. Validating network. 45 nodes to process in pass 1.
@ -338,7 +338,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, h1.y) : [10 x 64], [64 x *]
Validating --> OutputNodes.b = LearnableParameter() : -> [10] Validating --> OutputNodes.b = LearnableParameter() : -> [10]
Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *], [10] -> [10 x *] Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *], [10] -> [10 x *]
Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1] Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1] Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
Validating network. 20 nodes to process in pass 2. Validating network. 20 nodes to process in pass 2.
@ -378,7 +378,7 @@ Post-processing network complete.
05/13/2016 08:18:26: Evaluation criterion node(s): 05/13/2016 08:18:26: Evaluation criterion node(s):
05/13/2016 08:18:26: Err = ClassificationError 05/13/2016 08:18:26: Err = ErrorPrediction
Allocating matrices for forward and/or backward propagation. Allocating matrices for forward and/or backward propagation.
@ -489,7 +489,7 @@ Post-processing network...
3 roots: 3 roots:
CE = CrossEntropyWithSoftmax() CE = CrossEntropyWithSoftmax()
Err = ClassificationError() Err = ErrorPrediction()
OutputNodes.z = Plus() OutputNodes.z = Plus()
Validating network. 45 nodes to process in pass 1. Validating network. 45 nodes to process in pass 1.
@ -538,7 +538,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, h1.y) : [10 x 64], [64 x *1
Validating --> OutputNodes.b = LearnableParameter() : -> [10] Validating --> OutputNodes.b = LearnableParameter() : -> [10]
Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *1], [10] -> [10 x *1] Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *1], [10] -> [10 x *1]
Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1] Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1] Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
Validating network. 20 nodes to process in pass 2. Validating network. 20 nodes to process in pass 2.
View file
@ -356,7 +356,7 @@ Post-processing network...
3 roots: 3 roots:
CE = CrossEntropyWithSoftmax() CE = CrossEntropyWithSoftmax()
Err = ClassificationError() Err = ErrorPrediction()
OutputNodes.z = Plus() OutputNodes.z = Plus()
Validating network. 184 nodes to process in pass 1. Validating network. 184 nodes to process in pass 1.
@ -546,7 +546,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, pool) : [10 x 1 x 1 x 64],
Validating --> OutputNodes.b = LearnableParameter() : -> [10] Validating --> OutputNodes.b = LearnableParameter() : -> [10]
Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *], [10] -> [10 x *] Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *], [10] -> [10 x *]
Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1] Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1] Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
Validating network. 75 nodes to process in pass 2. Validating network. 75 nodes to process in pass 2.
@ -652,7 +652,7 @@ Post-processing network complete.
05/03/2016 18:13:08: Evaluation criterion node(s): 05/03/2016 18:13:08: Evaluation criterion node(s):
05/03/2016 18:13:08: Err = ClassificationError 05/03/2016 18:13:08: Err = ErrorPrediction
Allocating matrices for forward and/or backward propagation. Allocating matrices for forward and/or backward propagation.
@ -907,7 +907,7 @@ Post-processing network...
3 roots: 3 roots:
CE = CrossEntropyWithSoftmax() CE = CrossEntropyWithSoftmax()
Err = ClassificationError() Err = ErrorPrediction()
OutputNodes.z = Plus() OutputNodes.z = Plus()
Validating network. 184 nodes to process in pass 1. Validating network. 184 nodes to process in pass 1.
@ -1095,7 +1095,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, pool) : [10 x 1 x 1 x 64],
Validating --> OutputNodes.b = LearnableParameter() : -> [10] Validating --> OutputNodes.b = LearnableParameter() : -> [10]
Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *1], [10] -> [10 x *1] Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *1], [10] -> [10 x *1]
Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1] Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1] Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
Validating network. 75 nodes to process in pass 2. Validating network. 75 nodes to process in pass 2.
View file
@@ -354,7 +354,7 @@ Post-processing network...
 3 roots:
 CE = CrossEntropyWithSoftmax()
-Err = ClassificationError()
+Err = ErrorPrediction()
 OutputNodes.z = Plus()
 Validating network. 184 nodes to process in pass 1.
@@ -544,7 +544,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, pool) : [10 x 1 x 1 x 64],
 Validating --> OutputNodes.b = LearnableParameter() : -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *], [10] -> [10 x *]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
 Validating network. 75 nodes to process in pass 2.
@@ -650,7 +650,7 @@ Post-processing network complete.
 05/03/2016 14:04:12: Evaluation criterion node(s):
-05/03/2016 14:04:12: Err = ClassificationError
+05/03/2016 14:04:12: Err = ErrorPrediction
 Allocating matrices for forward and/or backward propagation.
@@ -905,7 +905,7 @@ Post-processing network...
 3 roots:
 CE = CrossEntropyWithSoftmax()
-Err = ClassificationError()
+Err = ErrorPrediction()
 OutputNodes.z = Plus()
 Validating network. 184 nodes to process in pass 1.
@@ -1093,7 +1093,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, pool) : [10 x 1 x 1 x 64],
 Validating --> OutputNodes.b = LearnableParameter() : -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *1], [10] -> [10 x *1]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
 Validating network. 75 nodes to process in pass 2.

View file

@@ -356,7 +356,7 @@ Post-processing network...
 3 roots:
 CE = CrossEntropyWithSoftmax()
-Err = ClassificationError()
+Err = ErrorPrediction()
 OutputNodes.z = Plus()
 Validating network. 949 nodes to process in pass 1.
@@ -1311,7 +1311,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, pool) : [10 x 1 x 1 x 64],
 Validating --> OutputNodes.b = LearnableParameter() : -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *], [10] -> [10 x *]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
 Validating network. 390 nodes to process in pass 2.
@@ -1777,7 +1777,7 @@ Post-processing network complete.
 05/03/2016 18:17:55: Evaluation criterion node(s):
-05/03/2016 18:17:55: Err = ClassificationError
+05/03/2016 18:17:55: Err = ErrorPrediction
 Allocating matrices for forward and/or backward propagation.
@@ -2932,7 +2932,7 @@ Post-processing network...
 3 roots:
 CE = CrossEntropyWithSoftmax()
-Err = ClassificationError()
+Err = ErrorPrediction()
 OutputNodes.z = Plus()
 Validating network. 949 nodes to process in pass 1.
@@ -3885,7 +3885,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, pool) : [10 x 1 x 1 x 64],
 Validating --> OutputNodes.b = LearnableParameter() : -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *1], [10] -> [10 x *1]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
 Validating network. 390 nodes to process in pass 2.

View file

@@ -354,7 +354,7 @@ Post-processing network...
 3 roots:
 CE = CrossEntropyWithSoftmax()
-Err = ClassificationError()
+Err = ErrorPrediction()
 OutputNodes.z = Plus()
 Validating network. 949 nodes to process in pass 1.
@@ -1309,7 +1309,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, pool) : [10 x 1 x 1 x 64],
 Validating --> OutputNodes.b = LearnableParameter() : -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *], [10] -> [10 x *]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
 Validating network. 390 nodes to process in pass 2.
@@ -1775,7 +1775,7 @@ Post-processing network complete.
 05/03/2016 14:05:00: Evaluation criterion node(s):
-05/03/2016 14:05:00: Err = ClassificationError
+05/03/2016 14:05:00: Err = ErrorPrediction
 Allocating matrices for forward and/or backward propagation.
@@ -2930,7 +2930,7 @@ Post-processing network...
 3 roots:
 CE = CrossEntropyWithSoftmax()
-Err = ClassificationError()
+Err = ErrorPrediction()
 OutputNodes.z = Plus()
 Validating network. 949 nodes to process in pass 1.
@@ -3883,7 +3883,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, pool) : [10 x 1 x 1 x 64],
 Validating --> OutputNodes.b = LearnableParameter() : -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *1], [10] -> [10 x *1]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
 Validating network. 390 nodes to process in pass 2.

View file

@@ -282,7 +282,7 @@ Post-processing network...
 3 roots:
 CE = CrossEntropyWithSoftmax()
-Err = ClassificationError()
+Err = ErrorPrediction()
 OutputNodes.z = Plus()
 Validating network. 32 nodes to process in pass 1.
@@ -318,7 +318,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, conv4.y) : [10 x 7 x 7 x 32
 Validating --> OutputNodes.b = LearnableParameter() : -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *], [10] -> [10 x *]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
 Validating network. 19 nodes to process in pass 2.
@@ -350,7 +350,7 @@ Post-processing network complete.
 05/13/2016 15:11:11: Evaluation criterion node(s):
-05/13/2016 15:11:11: Err = ClassificationError
+05/13/2016 15:11:11: Err = ErrorPrediction
 Allocating matrices for forward and/or backward propagation.
@@ -446,7 +446,7 @@ Post-processing network...
 3 roots:
 CE = CrossEntropyWithSoftmax()
-Err = ClassificationError()
+Err = ErrorPrediction()
 OutputNodes.z = Plus()
 Validating network. 32 nodes to process in pass 1.
@@ -482,7 +482,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, conv4.y) : [10 x 7 x 7 x 32
 Validating --> OutputNodes.b = LearnableParameter() : -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *1], [10] -> [10 x *1]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
 Validating network. 19 nodes to process in pass 2.

View file

@@ -280,7 +280,7 @@ Post-processing network...
 3 roots:
 CE = CrossEntropyWithSoftmax()
-Err = ClassificationError()
+Err = ErrorPrediction()
 OutputNodes.z = Plus()
 Validating network. 32 nodes to process in pass 1.
@@ -316,7 +316,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, conv4.y) : [10 x 7 x 7 x 32
 Validating --> OutputNodes.b = LearnableParameter() : -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *], [10] -> [10 x *]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *], [10 x *] -> [1]
 Validating network. 19 nodes to process in pass 2.
@@ -348,7 +348,7 @@ Post-processing network complete.
 05/13/2016 08:19:02: Evaluation criterion node(s):
-05/13/2016 08:19:02: Err = ClassificationError
+05/13/2016 08:19:02: Err = ErrorPrediction
 Allocating matrices for forward and/or backward propagation.
@@ -444,7 +444,7 @@ Post-processing network...
 3 roots:
 CE = CrossEntropyWithSoftmax()
-Err = ClassificationError()
+Err = ErrorPrediction()
 OutputNodes.z = Plus()
 Validating network. 32 nodes to process in pass 1.
@@ -480,7 +480,7 @@ Validating --> OutputNodes.t = Times (OutputNodes.W, conv4.y) : [10 x 7 x 7 x 32
 Validating --> OutputNodes.b = LearnableParameter() : -> [10]
 Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x *1], [10] -> [10 x *1]
 Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
-Validating --> Err = ClassificationError (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
+Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *1], [10 x *1] -> [1]
 Validating network. 19 nodes to process in pass 2.

View file

@@ -68,7 +68,7 @@ Multigpu_Demo_Train=[
 SimpleNetworkBuilder = [
 layerSizes = 2:50*2:2
 trainingCriterion = "CrossEntropyWithSoftmax"
-evalCriterion = "ClassificationError"
+evalCriterion = "ErrorPrediction"
 layerTypes = "Sigmoid"
 initValueScale = 1.0
 applyMeanVarNorm = true
@@ -169,7 +169,7 @@ Multigpu_Demo_Train=[
 SimpleNetworkBuilder = [
 layerSizes = 2:50*2:2
 trainingCriterion = "CrossEntropyWithSoftmax"
-evalCriterion = "ClassificationError"
+evalCriterion = "ErrorPrediction"
 layerTypes = "Sigmoid"
 initValueScale = 1.0
 applyMeanVarNorm = true
@@ -302,7 +302,7 @@ configparameters: Multigpu.cntk:Multigpu_Demo_Train=[
 SimpleNetworkBuilder = [
 layerSizes = 2:50*2:2
 trainingCriterion = "CrossEntropyWithSoftmax"
-evalCriterion = "ClassificationError"
+evalCriterion = "ErrorPrediction"
 layerTypes = "Sigmoid"
 initValueScale = 1.0
 applyMeanVarNorm = true
@@ -370,7 +370,7 @@ Post-processing network...
 7 roots:
 CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-EvalClassificationError = ClassificationError()
+EvalErrorPrediction = ErrorPrediction()
 InvStdOfFeatures = InvStdDev()
 MeanOfFeatures = Mean()
 PosteriorProb = Softmax()
@@ -399,7 +399,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *] -> [2 x 1 x *]
 Validating --> B2 = LearnableParameter() : -> [2 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *], [2 x 1] -> [2 x 1 x *]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *] -> [2 x 1 x *]
 Validating --> Prior = Mean (labels) : [2 x *] -> [2]
 Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@@ -423,14 +423,14 @@ Post-processing network complete.
 05/03/2016 15:21:43: Evaluation criterion node(s):
-05/03/2016 15:21:43: EvalClassificationError = ClassificationError
+05/03/2016 15:21:43: EvalErrorPrediction = ErrorPrediction
 Allocating matrices for forward and/or backward propagation.
 Memory Sharing Structure:
-(nil): {[EvalClassificationError Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
+(nil): {[EvalErrorPrediction Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
 0x1abc7c8: {[InvStdOfFeatures Value[2]] }
 0x1b40348: {[features Value[2 x *]] }
 0x1b408b8: {[MeanOfFeatures Value[2]] }
@@ -443,7 +443,7 @@ Memory Sharing Structure:
 0x1b46708: {[labels Value[2 x *]] }
 0x1b473e8: {[Prior Value[2]] }
 0x1b4b138: {[ScaledLogLikelihood Value[2 x 1 x *]] }
-0x1b4cc28: {[EvalClassificationError Value[1]] }
+0x1b4cc28: {[EvalErrorPrediction Value[1]] }
 0x1b4cea8: {[CrossEntropyWithSoftmax Value[1]] }
 0x1b4d388: {[H1 Value[50 x 1 x *]] [W0*features Gradient[50 x *]] }
 0x1b4d548: {[W0*features+B0 Gradient[50 x 1 x *]] [W1*H1 Value[50 x 1 x *]] }
@@ -473,139 +473,139 @@ Memory Sharing Structure:
 05/03/2016 15:21:44: Starting Epoch 1: learning rate per sample = 0.020000 effective momentum = 0.900000 momentum as time constant = 237.3 samples
 05/03/2016 15:21:44: Starting minibatch loop.
-05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 1- 10]: CrossEntropyWithSoftmax = 0.69966235 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0538s; samplesPerSecond = 4647.4
+05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 1- 10]: CrossEntropyWithSoftmax = 0.69966235 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0538s; samplesPerSecond = 4647.4
-05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 11- 20]: CrossEntropyWithSoftmax = 0.70639648 * 250; EvalClassificationError = 0.49600000 * 250; time = 0.1073s; samplesPerSecond = 2329.6
+05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 11- 20]: CrossEntropyWithSoftmax = 0.70639648 * 250; EvalErrorPrediction = 0.49600000 * 250; time = 0.1073s; samplesPerSecond = 2329.6
-05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 21- 30]: CrossEntropyWithSoftmax = 0.70470264 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0631s; samplesPerSecond = 3961.3
+05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 21- 30]: CrossEntropyWithSoftmax = 0.70470264 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0631s; samplesPerSecond = 3961.3
-05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 31- 40]: CrossEntropyWithSoftmax = 0.69813501 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0747s; samplesPerSecond = 3346.9
+05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 31- 40]: CrossEntropyWithSoftmax = 0.69813501 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0747s; samplesPerSecond = 3346.9
-05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 41- 50]: CrossEntropyWithSoftmax = 0.73551416 * 250; EvalClassificationError = 0.57600000 * 250; time = 0.0900s; samplesPerSecond = 2778.4
+05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 41- 50]: CrossEntropyWithSoftmax = 0.73551416 * 250; EvalErrorPrediction = 0.57600000 * 250; time = 0.0900s; samplesPerSecond = 2778.4
-05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 51- 60]: CrossEntropyWithSoftmax = 0.72432324 * 250; EvalClassificationError = 0.50800000 * 250; time = 0.0605s; samplesPerSecond = 4135.0
+05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 51- 60]: CrossEntropyWithSoftmax = 0.72432324 * 250; EvalErrorPrediction = 0.50800000 * 250; time = 0.0605s; samplesPerSecond = 4135.0
-05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 61- 70]: CrossEntropyWithSoftmax = 0.73327588 * 250; EvalClassificationError = 0.48800000 * 250; time = 0.0619s; samplesPerSecond = 4039.0
+05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 61- 70]: CrossEntropyWithSoftmax = 0.73327588 * 250; EvalErrorPrediction = 0.48800000 * 250; time = 0.0619s; samplesPerSecond = 4039.0
-05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 71- 80]: CrossEntropyWithSoftmax = 0.70092627 * 250; EvalClassificationError = 0.50400000 * 250; time = 0.0769s; samplesPerSecond = 3249.9
+05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 71- 80]: CrossEntropyWithSoftmax = 0.70092627 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.0769s; samplesPerSecond = 3249.9
-05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 81- 90]: CrossEntropyWithSoftmax = 0.72354980 * 250; EvalClassificationError = 0.46000000 * 250; time = 0.0799s; samplesPerSecond = 3129.0
+05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 81- 90]: CrossEntropyWithSoftmax = 0.72354980 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0799s; samplesPerSecond = 3129.0
-05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 91- 100]: CrossEntropyWithSoftmax = 0.72148096 * 250; EvalClassificationError = 0.52000000 * 250; time = 0.0620s; samplesPerSecond = 4031.5
+05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 91- 100]: CrossEntropyWithSoftmax = 0.72148096 * 250; EvalErrorPrediction = 0.52000000 * 250; time = 0.0620s; samplesPerSecond = 4031.5
-05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.69814941 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.1278s; samplesPerSecond = 1955.9
+05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.69814941 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.1278s; samplesPerSecond = 1955.9
-05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.70699121 * 250; EvalClassificationError = 0.54800000 * 250; time = 0.0821s; samplesPerSecond = 3044.1
+05/03/2016 15:21:44: Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.70699121 * 250; EvalErrorPrediction = 0.54800000 * 250; time = 0.0821s; samplesPerSecond = 3044.1
-05/03/2016 15:21:45: Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.69898437 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0755s; samplesPerSecond = 3312.4
+05/03/2016 15:21:45: Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.69898437 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0755s; samplesPerSecond = 3312.4
-05/03/2016 15:21:45: Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.71712695 * 250; EvalClassificationError = 0.54000000 * 250; time = 0.0657s; samplesPerSecond = 3804.8
+05/03/2016 15:21:45: Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.71712695 * 250; EvalErrorPrediction = 0.54000000 * 250; time = 0.0657s; samplesPerSecond = 3804.8
-05/03/2016 15:21:45: Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.69470703 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.1049s; samplesPerSecond = 2382.9
+05/03/2016 15:21:45: Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.69470703 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.1049s; samplesPerSecond = 2382.9
-05/03/2016 15:21:45: Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.71375879 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.1180s; samplesPerSecond = 2117.9
+05/03/2016 15:21:45: Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.71375879 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.1180s; samplesPerSecond = 2117.9
-05/03/2016 15:21:45: Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70381641 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.1065s; samplesPerSecond = 2347.9
+05/03/2016 15:21:45: Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70381641 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.1065s; samplesPerSecond = 2347.9
-05/03/2016 15:21:45: Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.71748633 * 250; EvalClassificationError = 0.48800000 * 250; time = 0.2709s; samplesPerSecond = 922.9
+05/03/2016 15:21:45: Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.71748633 * 250; EvalErrorPrediction = 0.48800000 * 250; time = 0.2709s; samplesPerSecond = 922.9
-05/03/2016 15:21:45: Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.71863281 * 250; EvalClassificationError = 0.50400000 * 250; time = 0.1375s; samplesPerSecond = 1818.4
+05/03/2016 15:21:45: Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.71863281 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.1375s; samplesPerSecond = 1818.4
-05/03/2016 15:21:45: Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.70715234 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.1143s; samplesPerSecond = 2186.6
+05/03/2016 15:21:45: Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.70715234 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.1143s; samplesPerSecond = 2186.6
-05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.70401074 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.1079s; samplesPerSecond = 2317.1
+05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.70401074 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.1079s; samplesPerSecond = 2317.1
-05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70599414 * 250; EvalClassificationError = 0.48400000 * 250; time = 0.0917s; samplesPerSecond = 2727.7
+05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70599414 * 250; EvalErrorPrediction = 0.48400000 * 250; time = 0.0917s; samplesPerSecond = 2727.7
-05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69628711 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0923s; samplesPerSecond = 2707.6
+05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69628711 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0923s; samplesPerSecond = 2707.6
-05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.75920898 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0887s; samplesPerSecond = 2819.0
+05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.75920898 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0887s; samplesPerSecond = 2819.0
-05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.70542578 * 250; EvalClassificationError = 0.43600000 * 250; time = 0.0634s; samplesPerSecond = 3945.8
+05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.70542578 * 250; EvalErrorPrediction = 0.43600000 * 250; time = 0.0634s; samplesPerSecond = 3945.8
-05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.70643945 * 250; EvalClassificationError = 0.46400000 * 250; time = 0.0885s; samplesPerSecond = 2823.7
+05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.70643945 * 250; EvalErrorPrediction = 0.46400000 * 250; time = 0.0885s; samplesPerSecond = 2823.7
-05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.72481641 * 250; EvalClassificationError = 0.51600000 * 250; time = 0.0601s; samplesPerSecond = 4162.6
+05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.72481641 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0601s; samplesPerSecond = 4162.6
-05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.71133594 * 250; EvalClassificationError = 0.55600000 * 250; time = 0.0630s; samplesPerSecond = 3968.1
+05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.71133594 * 250; EvalErrorPrediction = 0.55600000 * 250; time = 0.0630s; samplesPerSecond = 3968.1
-05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.68605664 * 250; EvalClassificationError = 0.47200000 * 250; time = 0.0849s; samplesPerSecond = 2944.1
+05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.68605664 * 250; EvalErrorPrediction = 0.47200000 * 250; time = 0.0849s; samplesPerSecond = 2944.1
-05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.69535352 * 250; EvalClassificationError = 0.47200000 * 250; time = 0.0879s; samplesPerSecond = 2844.6
+05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.69535352 * 250; EvalErrorPrediction = 0.47200000 * 250; time = 0.0879s; samplesPerSecond = 2844.6
-05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.68741797 * 250; EvalClassificationError = 0.45200000 * 250; time = 0.0752s; samplesPerSecond = 3325.7
+05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.68741797 * 250; EvalErrorPrediction = 0.45200000 * 250; time = 0.0752s; samplesPerSecond = 3325.7
-05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.67916406 * 250; EvalClassificationError = 0.46000000 * 250; time = 0.0958s; samplesPerSecond = 2610.3
+05/03/2016 15:21:46: Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.67916406 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0958s; samplesPerSecond = 2610.3
-05/03/2016 15:21:47: Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.67841992 * 250; EvalClassificationError = 0.44800000 * 250; time = 0.1009s; samplesPerSecond = 2478.7
+05/03/2016 15:21:47: Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.67841992 * 250; EvalErrorPrediction = 0.44800000 * 250; time = 0.1009s; samplesPerSecond = 2478.7
-05/03/2016 15:21:47: Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.68038477 * 250; EvalClassificationError = 0.49200000 * 250; time = 0.1607s; samplesPerSecond = 1555.6
+05/03/2016 15:21:47: Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.68038477 * 250; EvalErrorPrediction = 0.49200000 * 250; time = 0.1607s; samplesPerSecond = 1555.6
-05/03/2016 15:21:47: Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.61937109 * 250; EvalClassificationError = 0.30400000 * 250; time = 0.1131s; samplesPerSecond = 2211.4
+05/03/2016 15:21:47: Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.61937109 * 250; EvalErrorPrediction = 0.30400000 * 250; time = 0.1131s; samplesPerSecond = 2211.4
-05/03/2016 15:21:47: Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.57844141 * 250; EvalClassificationError = 0.27200000 * 250; time = 0.1047s; samplesPerSecond = 2388.5
+05/03/2016 15:21:47: Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.57844141 * 250; EvalErrorPrediction = 0.27200000 * 250; time = 0.1047s; samplesPerSecond = 2388.5
-05/03/2016 15:21:47: Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.49124023 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0896s; samplesPerSecond = 2791.5
+05/03/2016 15:21:47: Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.49124023 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0896s; samplesPerSecond = 2791.5
-05/03/2016 15:21:47: Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.39071289 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0727s; samplesPerSecond = 3438.8
+05/03/2016 15:21:47: Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.39071289 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0727s; samplesPerSecond = 3438.8
-05/03/2016 15:21:47: Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.27650586 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.2624s; samplesPerSecond = 952.6
+05/03/2016 15:21:47: Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.27650586 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.2624s; samplesPerSecond = 952.6
-05/03/2016 15:21:47: Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.26430078 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0842s; samplesPerSecond = 2967.7
+05/03/2016 15:21:47: Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.26430078 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0842s; samplesPerSecond = 2967.7
-05/03/2016 15:21:47: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.66664150 * 10000; EvalClassificationError = 0.44430000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=3.93174s
+05/03/2016 15:21:47: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.66664150 * 10000; EvalErrorPrediction = 0.44430000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=3.93174s
 05/03/2016 15:21:47: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152142.598996/CNTKTextFormatReader/Examples/Other/Simple2d_MultiGpu@release_cpu/Models/multigpu.dnn.1'
 05/03/2016 15:21:47: Starting Epoch 2: learning rate per sample = 0.008000 effective momentum = 0.900000 momentum as time constant = 237.3 samples
 05/03/2016 15:21:47: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1).
-05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.20720006 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0545s; samplesPerSecond = 4583.4
+05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.20720006 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0545s; samplesPerSecond = 4583.4
-05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.19690290 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0641s; samplesPerSecond = 3899.7
+05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.19690290 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0641s; samplesPerSecond = 3899.7
-05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.16064646 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0770s; samplesPerSecond = 3247.1
+05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.16064646 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0770s; samplesPerSecond = 3247.1
-05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.13547171 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0640s; samplesPerSecond = 3904.2
+05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.13547171 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0640s; samplesPerSecond = 3904.2
-05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.18000261 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0732s; samplesPerSecond = 3413.6
+05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.18000261 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0732s; samplesPerSecond = 3413.6
-05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.17787841 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0790s; samplesPerSecond = 3164.0
+05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.17787841 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0790s; samplesPerSecond = 3164.0
-05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.16821879 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0880s; samplesPerSecond = 2839.4
+05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.16821879 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0880s; samplesPerSecond = 2839.4
-05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.16363456 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0854s; samplesPerSecond = 2926.8
+05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.16363456 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0854s; samplesPerSecond = 2926.8
-05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.19533907 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0774s; samplesPerSecond = 3228.6
+05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.19533907 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0774s; samplesPerSecond = 3228.6
-05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.19318692 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0820s; samplesPerSecond = 3049.5
+05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.19318692 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0820s; samplesPerSecond = 3049.5
-05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.12726279 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0766s; samplesPerSecond = 3261.6
+05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.12726279 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0766s; samplesPerSecond = 3261.6
-05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.18620067 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0773s; samplesPerSecond = 3235.5
+05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.18620067 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0773s; samplesPerSecond = 3235.5
-05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.11547500 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0797s; samplesPerSecond = 3136.6
+05/03/2016 15:21:48: Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.11547500 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0797s; samplesPerSecond = 3136.6
-05/03/2016 15:21:49: Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16675950 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0833s; samplesPerSecond = 2999.8
+05/03/2016 15:21:49: Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16675950 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0833s; samplesPerSecond = 2999.8
-05/03/2016 15:21:49: Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.15807389 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0822s; samplesPerSecond = 3042.5
+05/03/2016 15:21:49: Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.15807389 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0822s; samplesPerSecond = 3042.5
-05/03/2016 15:21:49: Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.18389093 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0726s; samplesPerSecond = 3443.0
+05/03/2016 15:21:49: Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.18389093 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0726s; samplesPerSecond = 3443.0
-05/03/2016 15:21:49: Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.18269750 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0897s; samplesPerSecond = 2787.7
+05/03/2016 15:21:49: Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.18269750 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0897s; samplesPerSecond = 2787.7
-05/03/2016 15:21:49: Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18737841 * 250; EvalClassificationError = 0.09600000 * 250; time = 0.0963s; samplesPerSecond = 2597.3
+05/03/2016 15:21:49: Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18737841 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0963s; samplesPerSecond = 2597.3
-05/03/2016 15:21:49: Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.20174757 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0811s; samplesPerSecond = 3081.1
+05/03/2016 15:21:49: Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.20174757 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0811s; samplesPerSecond = 3081.1
-05/03/2016 15:21:49: Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.13336708 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0732s; samplesPerSecond = 3414.6
+05/03/2016 15:21:49: Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.13336708 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0732s; samplesPerSecond = 3414.6
-05/03/2016 15:21:49: Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13851332 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0879s; samplesPerSecond = 2843.0
+05/03/2016 15:21:49: Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13851332 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0879s; samplesPerSecond = 2843.0
-05/03/2016 15:21:49: Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.15422288 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0821s; samplesPerSecond = 3044.3
+05/03/2016 15:21:49: Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.15422288 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0821s; samplesPerSecond = 3044.3
-05/03/2016 15:21:49: Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.15478799 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0815s; samplesPerSecond = 3069.2
+05/03/2016 15:21:49: Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.15478799 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0815s; samplesPerSecond = 3069.2
-05/03/2016 15:21:49: Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.14530201 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0810s; samplesPerSecond = 3086.3
+05/03/2016 15:21:49: Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.14530201 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0810s; samplesPerSecond = 3086.3
-05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.12192809 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.2596s; samplesPerSecond = 962.9
+05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.12192809 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.2596s; samplesPerSecond = 962.9
-05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.13975597 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0569s; samplesPerSecond = 4394.5
+05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.13975597 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0569s; samplesPerSecond = 4394.5
-05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12566363 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0911s; samplesPerSecond = 2744.6
+05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12566363 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0911s; samplesPerSecond = 2744.6
-05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.18963051 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0765s; samplesPerSecond = 3267.2
+05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.18963051 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0765s; samplesPerSecond = 3267.2
-05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.17955467 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0914s; samplesPerSecond = 2736.4
+05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.17955467 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0914s; samplesPerSecond = 2736.4
-05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.18862103 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0772s; samplesPerSecond = 3236.7
+05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.18862103 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0772s; samplesPerSecond = 3236.7
-05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17503073 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0775s; samplesPerSecond = 3225.8
+05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17503073 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0775s; samplesPerSecond = 3225.8
-05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.14741998 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0774s; samplesPerSecond = 3230.1
+05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.14741998 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0774s; samplesPerSecond = 3230.1
-05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.13803981 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0726s; samplesPerSecond = 3443.0
+05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.13803981 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0726s; samplesPerSecond = 3443.0
-05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.14139232 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0820s; samplesPerSecond = 3048.4
+05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.14139232 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0820s; samplesPerSecond = 3048.4
-05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13886877 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0766s; samplesPerSecond = 3264.1
+05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13886877 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0766s; samplesPerSecond = 3264.1
-05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.15025864 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0852s; samplesPerSecond = 2933.5
+05/03/2016 15:21:50: Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.15025864 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0852s; samplesPerSecond = 2933.5
-05/03/2016 15:21:51: Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.14659342 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0903s; samplesPerSecond = 2767.4
+05/03/2016 15:21:51: Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.14659342 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0903s; samplesPerSecond = 2767.4
-05/03/2016 15:21:51: Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.13078795 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0784s; samplesPerSecond = 3187.6
+05/03/2016 15:21:51: Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.13078795 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0784s; samplesPerSecond = 3187.6
-05/03/2016 15:21:51: Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.19832882 * 250; EvalClassificationError = 0.11600000 * 250; time = 0.0772s; samplesPerSecond = 3240.4
+05/03/2016 15:21:51: Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.19832882 * 250; EvalErrorPrediction = 0.11600000 * 250; time = 0.0772s; samplesPerSecond = 3240.4
-05/03/2016 15:21:51: Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15828904 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0721s; samplesPerSecond = 3468.7
+05/03/2016 15:21:51: Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15828904 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0721s; samplesPerSecond = 3468.7
-05/03/2016 15:21:51: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.16210811 * 10000; EvalClassificationError = 0.07480000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=3.34279s
+05/03/2016 15:21:51: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.16210811 * 10000; EvalErrorPrediction = 0.07480000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=3.34279s
 05/03/2016 15:21:51: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152142.598996/CNTKTextFormatReader/Examples/Other/Simple2d_MultiGpu@release_cpu/Models/multigpu.dnn.2'
 05/03/2016 15:21:51: Starting Epoch 3: learning rate per sample = 0.008000 effective momentum = 0.900000 momentum as time constant = 237.3 samples
 05/03/2016 15:21:51: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1).
-05/03/2016 15:21:51: Epoch[ 3 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.19031988 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0960s; samplesPerSecond = 2604.5
+05/03/2016 15:21:51: Epoch[ 3 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.19031988 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0960s; samplesPerSecond = 2604.5
-05/03/2016 15:21:51: Epoch[ 3 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.13920714 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0967s; samplesPerSecond = 2585.3
+05/03/2016 15:21:51: Epoch[ 3 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.13920714 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0967s; samplesPerSecond = 2585.3
-05/03/2016 15:21:51: Epoch[ 3 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.14595162 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0869s; samplesPerSecond = 2877.8
+05/03/2016 15:21:51: Epoch[ 3 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.14595162 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0869s; samplesPerSecond = 2877.8
-05/03/2016 15:21:51: Epoch[ 3 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.13324012 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0817s; samplesPerSecond = 3060.5
+05/03/2016 15:21:51: Epoch[ 3 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.13324012 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0817s; samplesPerSecond = 3060.5
-05/03/2016 15:21:51: Epoch[ 3 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.17358728 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0804s; samplesPerSecond = 3109.2
+05/03/2016 15:21:51: Epoch[ 3 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.17358728 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0804s; samplesPerSecond = 3109.2
-05/03/2016 15:21:51: Epoch[ 3 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.17949159 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0660s; samplesPerSecond = 3788.1
+05/03/2016 15:21:51: Epoch[ 3 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.17949159 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0660s; samplesPerSecond = 3788.1
-05/03/2016 15:21:51: Epoch[ 3 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.15009323 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0653s; samplesPerSecond = 3829.5
+05/03/2016 15:21:51: Epoch[ 3 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.15009323 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0653s; samplesPerSecond = 3829.5
-05/03/2016 15:21:51: Epoch[ 3 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.17060954 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0660s; samplesPerSecond = 3787.3
+05/03/2016 15:21:51: Epoch[ 3 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.17060954 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0660s; samplesPerSecond = 3787.3
-05/03/2016 15:21:52: Epoch[ 3 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.10410764 * 250; EvalClassificationError = 0.04000000 * 250; time = 0.0762s; samplesPerSecond = 3280.0
+05/03/2016 15:21:52: Epoch[ 3 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.10410764 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0762s; samplesPerSecond = 3280.0
-05/03/2016 15:21:52: Epoch[ 3 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.20572259 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.2571s; samplesPerSecond = 972.5
+05/03/2016 15:21:52: Epoch[ 3 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.20572259 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.2571s; samplesPerSecond = 972.5
-05/03/2016 15:21:52: Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.16519130 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0640s; samplesPerSecond = 3906.2
+05/03/2016 15:21:52: Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.16519130 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0640s; samplesPerSecond = 3906.2
-05/03/2016 15:21:52: Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.14908187 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0593s; samplesPerSecond = 4213.2
+05/03/2016 15:21:52: Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.14908187 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0593s; samplesPerSecond = 4213.2
-05/03/2016 15:21:52: Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.19227612 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0688s; samplesPerSecond = 3632.8
+05/03/2016 15:21:52: Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.19227612 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0688s; samplesPerSecond = 3632.8
-05/03/2016 15:21:52: Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13670934 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0532s; samplesPerSecond = 4700.3
+05/03/2016 15:21:52: Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13670934 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0532s; samplesPerSecond = 4700.3
-05/03/2016 15:21:52: Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.21113164 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0693s; samplesPerSecond = 3609.4
+05/03/2016 15:21:52: Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.21113164 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0693s; samplesPerSecond = 3609.4
-05/03/2016 15:21:52: Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.13129944 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0882s; samplesPerSecond = 2833.6
+05/03/2016 15:21:52: Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.13129944 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0882s; samplesPerSecond = 2833.6
-05/03/2016 15:21:52: Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17304376 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0840s; samplesPerSecond = 2975.2
+05/03/2016 15:21:52: Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17304376 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0840s; samplesPerSecond = 2975.2
-05/03/2016 15:21:52: Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16479250 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0685s; samplesPerSecond = 3648.5
+05/03/2016 15:21:52: Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16479250 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0685s; samplesPerSecond = 3648.5
-05/03/2016 15:21:52: Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.14591786 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0976s; samplesPerSecond = 2561.0
+05/03/2016 15:21:52: Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.14591786 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0976s; samplesPerSecond = 2561.0
-05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.12562012 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0969s; samplesPerSecond = 2580.7
+05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.12562012 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0969s; samplesPerSecond = 2580.7
-05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13442773 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0959s; samplesPerSecond = 2607.8
+05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13442773 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0959s; samplesPerSecond = 2607.8
-05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.17125328 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0754s; samplesPerSecond = 3314.6
+05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.17125328 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0754s; samplesPerSecond = 3314.6
-05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.22482522 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.1037s; samplesPerSecond = 2410.8
+05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.22482522 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.1037s; samplesPerSecond = 2410.8
-05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.18291792 * 250; EvalClassificationError = 0.11600000 * 250; time = 0.0650s; samplesPerSecond = 3844.3
+05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.18291792 * 250; EvalErrorPrediction = 0.11600000 * 250; time = 0.0650s; samplesPerSecond = 3844.3
05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.20296558 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0823s; samplesPerSecond = 3038.9 05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.20296558 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0823s; samplesPerSecond = 3038.9
05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.22849719 * 250; EvalClassificationError = 0.12400000 * 250; time = 0.0828s; samplesPerSecond = 3020.2 05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.22849719 * 250; EvalErrorPrediction = 0.12400000 * 250; time = 0.0828s; samplesPerSecond = 3020.2
05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12500068 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0864s; samplesPerSecond = 2894.1 05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12500068 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0864s; samplesPerSecond = 2894.1
05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.15719802 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0840s; samplesPerSecond = 2976.4 05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.15719802 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0840s; samplesPerSecond = 2976.4
05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.11520810 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0687s; samplesPerSecond = 3636.7 05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.11520810 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0687s; samplesPerSecond = 3636.7
05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.14159592 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0974s; samplesPerSecond = 2567.1 05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.14159592 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0974s; samplesPerSecond = 2567.1
05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.18509569 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0721s; samplesPerSecond = 3465.4 05/03/2016 15:21:53: Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.18509569 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0721s; samplesPerSecond = 3465.4
05/03/2016 15:21:54: Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.15008345 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0905s; samplesPerSecond = 2763.6 05/03/2016 15:21:54: Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.15008345 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0905s; samplesPerSecond = 2763.6
05/03/2016 15:21:54: Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.12866435 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0902s; samplesPerSecond = 2770.5 05/03/2016 15:21:54: Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.12866435 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0902s; samplesPerSecond = 2770.5
05/03/2016 15:21:54: Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.17640526 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0896s; samplesPerSecond = 2789.2 05/03/2016 15:21:54: Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.17640526 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0896s; samplesPerSecond = 2789.2
05/03/2016 15:21:54: Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.14982110 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.2845s; samplesPerSecond = 878.8 05/03/2016 15:21:54: Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.14982110 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.2845s; samplesPerSecond = 878.8
05/03/2016 15:21:54: Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.11472753 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0867s; samplesPerSecond = 2882.5 05/03/2016 15:21:54: Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.11472753 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0867s; samplesPerSecond = 2882.5
05/03/2016 15:21:54: Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16524783 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0755s; samplesPerSecond = 3312.4 05/03/2016 15:21:54: Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16524783 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0755s; samplesPerSecond = 3312.4
05/03/2016 15:21:54: Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14961037 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0958s; samplesPerSecond = 2608.8 05/03/2016 15:21:54: Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14961037 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0958s; samplesPerSecond = 2608.8
05/03/2016 15:21:54: Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.15972387 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0972s; samplesPerSecond = 2572.7 05/03/2016 15:21:54: Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.15972387 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0972s; samplesPerSecond = 2572.7
05/03/2016 15:21:54: Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.17867958 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0969s; samplesPerSecond = 2581.0 05/03/2016 15:21:54: Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.17867958 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0969s; samplesPerSecond = 2581.0
05/03/2016 15:21:54: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.16073358 * 10000; EvalClassificationError = 0.07780000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=3.65495s 05/03/2016 15:21:54: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.16073358 * 10000; EvalErrorPrediction = 0.07780000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=3.65495s
05/03/2016 15:21:54: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152142.598996/CNTKTextFormatReader/Examples/Other/Simple2d_MultiGpu@release_cpu/Models/multigpu.dnn' 05/03/2016 15:21:54: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152142.598996/CNTKTextFormatReader/Examples/Other/Simple2d_MultiGpu@release_cpu/Models/multigpu.dnn'
05/03/2016 15:21:54: CNTKCommandTrainEnd: Multigpu_Demo_Train 05/03/2016 15:21:54: CNTKCommandTrainEnd: Multigpu_Demo_Train
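For reference, each "Finished Epoch" summary above is the sample-weighted average of the per-minibatch figures (each minibatch line reports "average * sample count"). A minimal sketch in Python of that aggregation; the (value, count) pairs are read off the log lines, not produced by CNTK:

def epoch_average(minibatches):
    # minibatches: (average_value, sample_count) pairs as printed in the log,
    # e.g. (0.17358728, 250) from "CrossEntropyWithSoftmax = 0.17358728 * 250".
    weighted = sum(value * count for value, count in minibatches)
    samples = sum(count for _, count in minibatches)
    return weighted / samples

# The epoch-3 summary "CrossEntropyWithSoftmax = 0.16073358 * 10000" is
# epoch_average over all 400 minibatches; the error figure likewise.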
@@ -623,7 +623,7 @@ Post-processing network...
7 roots:
	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
	InvStdOfFeatures = InvStdDev()
	MeanOfFeatures = Mean()
	PosteriorProb = Softmax()
@@ -652,7 +652,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *1] -> [2 x 1 x *1]
Validating --> B2 = LearnableParameter() :  -> [2 x 1]
Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *1], [2 x 1] -> [2 x 1 x *1]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *1] -> [2 x 1 x *1]
Validating --> Prior = Mean (labels) : [2 x *1] -> [2]
Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
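The validation trace pins down the whole forward computation: 2 input features, two 50-unit sigmoid hidden layers, a 2-unit output HLast, and a softmax posterior with cross entropy and error rate on top. A rough NumPy rendering of the same graph, as an illustration only (variable names follow the node names in the trace; the hidden-layer steps sit outside this excerpt but follow the same pattern, and the softmax here is numerically naive):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def forward(features, W0, B0, W1, B1, W2, B2, labels):
    H1 = sigmoid(W0 @ features + B0)                 # [50 x n]
    H2 = sigmoid(W1 @ H1 + B1)                       # [50 x n]
    HLast = W2 @ H2 + B2                             # [2 x n], cf. Plus (W2*H1, B2)
    PosteriorProb = np.exp(HLast) / np.exp(HLast).sum(axis=0)   # Softmax (HLast)
    # per-sample -sum(labels * log softmax), averaged over the minibatch:
    CrossEntropyWithSoftmax = -(labels * np.log(PosteriorProb)).sum(axis=0).mean()
    EvalErrorPrediction = (PosteriorProb.argmax(axis=0) != labels.argmax(axis=0)).mean()
    return CrossEntropyWithSoftmax, EvalErrorPrediction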
@@ -676,7 +676,7 @@ Allocating matrices for forward and/or backward propagation.
Memory Sharing Structure:
-(nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalClassificationError Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
+(nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalErrorPrediction Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
0x1abbf28: {[B0 Value[50 x 1]] }
0x1b47908: {[W1 Value[50 x 50]] }
0x1b48278: {[W2 Value[2 x 50]] }
@@ -688,7 +688,7 @@ Memory Sharing Structure:
0x1b50cd8: {[Prior Value[2]] }
0x1b514f8: {[W0 Value[50 x 2]] }
0x1b53938: {[B1 Value[50 x 1]] }
-0x1c0fd98: {[EvalClassificationError Value[1]] }
+0x1c0fd98: {[EvalErrorPrediction Value[1]] }
0x1c0fef8: {[CrossEntropyWithSoftmax Value[1]] }
0x1c10438: {[LogOfPrior Value[2]] }
0x1c11f48: {[MVNormalizedFeatures Value[2 x *1]] }
@@ -701,7 +701,7 @@ Memory Sharing Structure:
0x1c12d78: {[W2*H1 Value[2 x 1 x *1]] }
0x1c12f38: {[HLast Value[2 x 1 x *1]] }
-05/03/2016 15:21:55: Final Results: Minibatch[1-1]: EvalClassificationError = 0.05804312 * 603; CrossEntropyWithSoftmax = 0.12790061 * 603; perplexity = 1.13644005
+05/03/2016 15:21:55: Final Results: Minibatch[1-1]: EvalErrorPrediction = 0.05804312 * 603; CrossEntropyWithSoftmax = 0.12790061 * 603; perplexity = 1.13644005
05/03/2016 15:21:55: Action "test" complete.
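The perplexity in the final results line is just exp of the per-sample cross entropy, so the two figures are mutually consistent; a one-line check:

import math
print(math.exp(0.12790061))  # 1.13644005..., matching the logged perplexity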

View file

@@ -68,7 +68,7 @@ Multigpu_Demo_Train=[
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-       evalCriterion = "ClassificationError"
+       evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
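In this configuration, layerSizes = 2:50*2:2 is SimpleNetworkBuilder shorthand for a 2-50-50-2 network: the *2 repeats the preceding 50-unit hidden layer. A hypothetical expansion helper that mimics the notation (illustration only, not CNTK code):

def expand_layer_sizes(spec):
    # "2:50*2:2" -> [2, 50, 50, 2]; "size*n" repeats size n times.
    sizes = []
    for part in spec.split(":"):
        if "*" in part:
            size, times = part.split("*")
            sizes.extend([int(size)] * int(times))
        else:
            sizes.append(int(part))
    return sizes

print(expand_layer_sizes("2:50*2:2"))  # [2, 50, 50, 2]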
@@ -169,7 +169,7 @@ Multigpu_Demo_Train=[
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-       evalCriterion = "ClassificationError"
+       evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@@ -302,7 +302,7 @@ configparameters: Multigpu.cntk:Multigpu_Demo_Train=[
    SimpleNetworkBuilder = [
        layerSizes = 2:50*2:2
        trainingCriterion = "CrossEntropyWithSoftmax"
-       evalCriterion = "ClassificationError"
+       evalCriterion = "ErrorPrediction"
        layerTypes = "Sigmoid"
        initValueScale = 1.0
        applyMeanVarNorm = true
@@ -371,7 +371,7 @@ Post-processing network...
7 roots:
	CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-	EvalClassificationError = ClassificationError()
+	EvalErrorPrediction = ErrorPrediction()
	InvStdOfFeatures = InvStdDev()
	MeanOfFeatures = Mean()
	PosteriorProb = Softmax()
@@ -400,7 +400,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *] -> [2 x 1 x *]
Validating --> B2 = LearnableParameter() :  -> [2 x 1]
Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *], [2 x 1] -> [2 x 1 x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *] -> [2 x 1 x *]
Validating --> Prior = Mean (labels) : [2 x *] -> [2]
Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@@ -424,14 +424,14 @@ Post-processing network complete.
05/03/2016 15:21:55: Evaluation criterion node(s):
-05/03/2016 15:21:55: 	EvalClassificationError = ClassificationError
+05/03/2016 15:21:55: 	EvalErrorPrediction = ErrorPrediction
Allocating matrices for forward and/or backward propagation.
Memory Sharing Structure:
-(nil): {[EvalClassificationError Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
+(nil): {[EvalErrorPrediction Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
0x12a62e8: {[features Value[2 x *]] }
0x20202b8: {[MeanOfFeatures Value[2]] }
0x20207c8: {[InvStdOfFeatures Value[2]] }
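The ErrorPrediction criterion named above counts misclassifications: a sample is in error when the argmax of the network output HLast disagrees with the argmax of its one-hot label. A minimal sketch of those semantics, matching how the log reports an average error rate times a sample count (e.g. 0.05804312 * 603):

import numpy as np

def error_prediction(labels, hlast):
    # labels, hlast: arrays of shape [num_classes x num_samples];
    # returns the error *rate*, which the log multiplies back by the count.
    return float(np.mean(hlast.argmax(axis=0) != labels.argmax(axis=0)))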
@@ -444,7 +444,7 @@ Memory Sharing Structure:
0x278ae18: {[Prior Value[2]] }
0x278c158: {[LogOfPrior Value[2]] }
0x27908f8: {[H1 Value[50 x 1 x *]] [W0*features Gradient[50 x *]] }
-0x2790a18: {[EvalClassificationError Value[1]] }
+0x2790a18: {[EvalErrorPrediction Value[1]] }
0x2790d18: {[ScaledLogLikelihood Value[2 x 1 x *]] }
0x2790e78: {[CrossEntropyWithSoftmax Value[1]] }
0x27966e8: {[B0 Value[50 x 1]] }
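Two details of the epoch loop that follows are worth decoding. "momentum as time constant = 237.3 samples" restates the per-minibatch momentum 0.9 in per-sample terms for the 25-sample minibatches used here, via T = -mbSize / ln(momentum); and "NumGradientBits = 1" selects CNTK's 1-bit data-parallel SGD, which quantizes each gradient value to a single bit for aggregation and carries the quantization residual into the next minibatch. A quick check of the time constant:

import math

momentum_per_mb, mb_size = 0.9, 25   # 250 samples per 10 minibatches in the log
time_constant = -mb_size / math.log(momentum_per_mb)
print(time_constant)                 # ~237.3 samples, as logged below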
@@ -474,139 +474,139 @@ Memory Sharing Structure:
05/03/2016 15:21:56: Starting Epoch 1: learning rate per sample = 0.020000  effective momentum = 0.900000  momentum as time constant = 237.3 samples
05/03/2016 15:21:56: Starting minibatch loop.
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 1- 10]: CrossEntropyWithSoftmax = 0.70004456 * 250; EvalClassificationError = 0.52000000 * 250; time = 0.0059s; samplesPerSecond = 42038.0
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 1- 10]: CrossEntropyWithSoftmax = 0.70004456 * 250; EvalErrorPrediction = 0.52000000 * 250; time = 0.0059s; samplesPerSecond = 42038.0
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 11- 20]: CrossEntropyWithSoftmax = 0.70309900 * 250; EvalClassificationError = 0.51600000 * 250; time = 0.0049s; samplesPerSecond = 50525.5
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 11- 20]: CrossEntropyWithSoftmax = 0.70309900 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0049s; samplesPerSecond = 50525.5
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 21- 30]: CrossEntropyWithSoftmax = 0.70606104 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0050s; samplesPerSecond = 50423.6
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 21- 30]: CrossEntropyWithSoftmax = 0.70606104 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0050s; samplesPerSecond = 50423.6
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 31- 40]: CrossEntropyWithSoftmax = 0.69845532 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0049s; samplesPerSecond = 50689.4
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 31- 40]: CrossEntropyWithSoftmax = 0.69845532 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0049s; samplesPerSecond = 50689.4
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 41- 50]: CrossEntropyWithSoftmax = 0.73496533 * 250; EvalClassificationError = 0.57600000 * 250; time = 0.0050s; samplesPerSecond = 50261.4
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 41- 50]: CrossEntropyWithSoftmax = 0.73496533 * 250; EvalErrorPrediction = 0.57600000 * 250; time = 0.0050s; samplesPerSecond = 50261.4
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 51- 60]: CrossEntropyWithSoftmax = 0.72522827 * 250; EvalClassificationError = 0.50800000 * 250; time = 0.0050s; samplesPerSecond = 50454.1
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 51- 60]: CrossEntropyWithSoftmax = 0.72522827 * 250; EvalErrorPrediction = 0.50800000 * 250; time = 0.0050s; samplesPerSecond = 50454.1
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 61- 70]: CrossEntropyWithSoftmax = 0.73287500 * 250; EvalClassificationError = 0.48800000 * 250; time = 0.0049s; samplesPerSecond = 50576.6
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 61- 70]: CrossEntropyWithSoftmax = 0.73287500 * 250; EvalErrorPrediction = 0.48800000 * 250; time = 0.0049s; samplesPerSecond = 50576.6
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 71- 80]: CrossEntropyWithSoftmax = 0.70135547 * 250; EvalClassificationError = 0.51600000 * 250; time = 0.0049s; samplesPerSecond = 50566.3
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 71- 80]: CrossEntropyWithSoftmax = 0.70135547 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0049s; samplesPerSecond = 50566.3
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 81- 90]: CrossEntropyWithSoftmax = 0.72466504 * 250; EvalClassificationError = 0.46000000 * 250; time = 0.0049s; samplesPerSecond = 50515.3
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 81- 90]: CrossEntropyWithSoftmax = 0.72466504 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0049s; samplesPerSecond = 50515.3
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 91- 100]: CrossEntropyWithSoftmax = 0.72187500 * 250; EvalClassificationError = 0.52000000 * 250; time = 0.0049s; samplesPerSecond = 50730.5
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 91- 100]: CrossEntropyWithSoftmax = 0.72187500 * 250; EvalErrorPrediction = 0.52000000 * 250; time = 0.0049s; samplesPerSecond = 50730.5
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.69799023 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0049s; samplesPerSecond = 50751.1
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.69799023 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0049s; samplesPerSecond = 50751.1
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.70696387 * 250; EvalClassificationError = 0.54800000 * 250; time = 0.0050s; samplesPerSecond = 50454.1
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.70696387 * 250; EvalErrorPrediction = 0.54800000 * 250; time = 0.0050s; samplesPerSecond = 50454.1
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.69863965 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0050s; samplesPerSecond = 50393.1
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.69863965 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0050s; samplesPerSecond = 50393.1
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.71772461 * 250; EvalClassificationError = 0.54800000 * 250; time = 0.0048s; samplesPerSecond = 51899.5
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.71772461 * 250; EvalErrorPrediction = 0.54800000 * 250; time = 0.0048s; samplesPerSecond = 51899.5
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.69526270 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0047s; samplesPerSecond = 53544.7
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.69526270 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0047s; samplesPerSecond = 53544.7
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.71436426 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0047s; samplesPerSecond = 53498.8
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.71436426 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0047s; samplesPerSecond = 53498.8
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70399316 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0047s; samplesPerSecond = 53694.2
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70399316 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0047s; samplesPerSecond = 53694.2
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.71745508 * 250; EvalClassificationError = 0.48800000 * 250; time = 0.0046s; samplesPerSecond = 53879.3
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.71745508 * 250; EvalErrorPrediction = 0.48800000 * 250; time = 0.0046s; samplesPerSecond = 53879.3
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.71963184 * 250; EvalClassificationError = 0.49600000 * 250; time = 0.0047s; samplesPerSecond = 53521.7
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.71963184 * 250; EvalErrorPrediction = 0.49600000 * 250; time = 0.0047s; samplesPerSecond = 53521.7
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.70689941 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0047s; samplesPerSecond = 53602.1
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.70689941 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0047s; samplesPerSecond = 53602.1
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.70425098 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 53890.9
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.70425098 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 53890.9
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70622754 * 250; EvalClassificationError = 0.45200000 * 250; time = 0.0047s; samplesPerSecond = 53728.8
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70622754 * 250; EvalErrorPrediction = 0.45200000 * 250; time = 0.0047s; samplesPerSecond = 53728.8
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69729492 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 53786.6
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69729492 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 53786.6
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.75974219 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0046s; samplesPerSecond = 54265.2
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.75974219 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0046s; samplesPerSecond = 54265.2
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.70631250 * 250; EvalClassificationError = 0.43600000 * 250; time = 0.0047s; samplesPerSecond = 53659.6
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.70631250 * 250; EvalErrorPrediction = 0.43600000 * 250; time = 0.0047s; samplesPerSecond = 53659.6
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.70705664 * 250; EvalClassificationError = 0.46400000 * 250; time = 0.0047s; samplesPerSecond = 53602.1
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.70705664 * 250; EvalErrorPrediction = 0.46400000 * 250; time = 0.0047s; samplesPerSecond = 53602.1
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.72660352 * 250; EvalClassificationError = 0.51600000 * 250; time = 0.0046s; samplesPerSecond = 54124.3
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.72660352 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0046s; samplesPerSecond = 54124.3
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.71369727 * 250; EvalClassificationError = 0.55600000 * 250; time = 0.0047s; samplesPerSecond = 53441.6
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.71369727 * 250; EvalErrorPrediction = 0.55600000 * 250; time = 0.0047s; samplesPerSecond = 53441.6
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.68916602 * 250; EvalClassificationError = 0.47200000 * 250; time = 0.0047s; samplesPerSecond = 53659.6
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.68916602 * 250; EvalErrorPrediction = 0.47200000 * 250; time = 0.0047s; samplesPerSecond = 53659.6
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.69964844 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0047s; samplesPerSecond = 53339.0
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.69964844 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0047s; samplesPerSecond = 53339.0
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.69387891 * 250; EvalClassificationError = 0.46000000 * 250; time = 0.0046s; samplesPerSecond = 53832.9
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.69387891 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0046s; samplesPerSecond = 53832.9
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.68885742 * 250; EvalClassificationError = 0.50400000 * 250; time = 0.0047s; samplesPerSecond = 53350.4
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.68885742 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.0047s; samplesPerSecond = 53350.4
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.69388867 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0047s; samplesPerSecond = 53430.2
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.69388867 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0047s; samplesPerSecond = 53430.2
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.70363867 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0046s; samplesPerSecond = 53960.7
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.70363867 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0046s; samplesPerSecond = 53960.7
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.65449219 * 250; EvalClassificationError = 0.44400000 * 250; time = 0.0047s; samplesPerSecond = 53544.7
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.65449219 * 250; EvalErrorPrediction = 0.44400000 * 250; time = 0.0047s; samplesPerSecond = 53544.7
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.64607031 * 250; EvalClassificationError = 0.46000000 * 250; time = 0.0047s; samplesPerSecond = 53453.1
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.64607031 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0047s; samplesPerSecond = 53453.1
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.59492969 * 250; EvalClassificationError = 0.12400000 * 250; time = 0.0046s; samplesPerSecond = 53972.4
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.59492969 * 250; EvalErrorPrediction = 0.12400000 * 250; time = 0.0046s; samplesPerSecond = 53972.4
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.53965820 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0047s; samplesPerSecond = 53636.6
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.53965820 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0047s; samplesPerSecond = 53636.6
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.43681445 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0047s; samplesPerSecond = 52854.1
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.43681445 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0047s; samplesPerSecond = 52854.1
-05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.37407422 * 250; EvalClassificationError = 0.12000000 * 250; time = 0.0047s; samplesPerSecond = 53521.7
+05/03/2016 15:21:56: Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.37407422 * 250; EvalErrorPrediction = 0.12000000 * 250; time = 0.0047s; samplesPerSecond = 53521.7
-05/03/2016 15:21:56: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.68409629 * 10000; EvalClassificationError = 0.45780000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=0.194983s
+05/03/2016 15:21:56: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.68409629 * 10000; EvalErrorPrediction = 0.45780000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=0.194983s
05/03/2016 15:21:56: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152142.598996/CNTKTextFormatReader/Examples/Other/Simple2d_MultiGpu@release_gpu/Models/multigpu.dnn.1'
05/03/2016 15:21:56: Starting Epoch 2: learning rate per sample = 0.008000  effective momentum = 0.900000  momentum as time constant = 237.3 samples
05/03/2016 15:21:56: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1).
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.27919647 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0093s; samplesPerSecond = 26818.3
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.27919647 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0093s; samplesPerSecond = 26818.3
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.24468611 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0080s; samplesPerSecond = 31063.6
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.24468611 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0080s; samplesPerSecond = 31063.6
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.19639892 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0081s; samplesPerSecond = 30982.8
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.19639892 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0081s; samplesPerSecond = 30982.8
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.16397861 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0080s; samplesPerSecond = 31222.7
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.16397861 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0080s; samplesPerSecond = 31222.7
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.19745002 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0081s; samplesPerSecond = 30944.4
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.19745002 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0081s; samplesPerSecond = 30944.4
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.19548896 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0081s; samplesPerSecond = 30871.8
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.19548896 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0081s; samplesPerSecond = 30871.8
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.18230148 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0081s; samplesPerSecond = 30910.0
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.18230148 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0081s; samplesPerSecond = 30910.0
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.17531255 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0080s; samplesPerSecond = 31059.8
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.17531255 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0080s; samplesPerSecond = 31059.8
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.20166559 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0081s; samplesPerSecond = 30944.4
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.20166559 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0081s; samplesPerSecond = 30944.4
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.19749058 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0081s; samplesPerSecond = 31055.9
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.19749058 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0081s; samplesPerSecond = 31055.9
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.13463336 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0081s; samplesPerSecond = 30963.6
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.13463336 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0081s; samplesPerSecond = 30963.6
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.19006259 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0080s; samplesPerSecond = 31063.6
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.19006259 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0080s; samplesPerSecond = 31063.6
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.12234776 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0079s; samplesPerSecond = 31605.6
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.12234776 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0079s; samplesPerSecond = 31605.6
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16962922 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0077s; samplesPerSecond = 32649.9
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16962922 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0077s; samplesPerSecond = 32649.9
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.16091639 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0076s; samplesPerSecond = 32743.9
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.16091639 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0076s; samplesPerSecond = 32743.9
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.18624030 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32748.2
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.18624030 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32748.2
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.18465726 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0076s; samplesPerSecond = 32899.1
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.18465726 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0076s; samplesPerSecond = 32899.1
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18514518 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0077s; samplesPerSecond = 32620.0
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18514518 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0077s; samplesPerSecond = 32620.0
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.20127224 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0076s; samplesPerSecond = 32791.2
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.20127224 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0076s; samplesPerSecond = 32791.2
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.13418547 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0076s; samplesPerSecond = 32701.1
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.13418547 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0076s; samplesPerSecond = 32701.1
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13995001 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0076s; samplesPerSecond = 32838.6
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13995001 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0076s; samplesPerSecond = 32838.6
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.15602538 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32907.7
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.15602538 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32907.7
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.15448171 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32864.5
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.15448171 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32864.5
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.14780067 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32894.7
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.14780067 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32894.7
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.12361633 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0077s; samplesPerSecond = 32628.6
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.12361633 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0077s; samplesPerSecond = 32628.6
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.14079766 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0077s; samplesPerSecond = 32632.8
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.14079766 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0077s; samplesPerSecond = 32632.8
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12624363 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0076s; samplesPerSecond = 32899.1
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12624363 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0076s; samplesPerSecond = 32899.1
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.18913222 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32894.7
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.18913222 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32894.7
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.17952681 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0076s; samplesPerSecond = 32786.9
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.17952681 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0076s; samplesPerSecond = 32786.9
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.18825452 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0076s; samplesPerSecond = 32825.6
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.18825452 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0076s; samplesPerSecond = 32825.6
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17517656 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0076s; samplesPerSecond = 32942.4
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17517656 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0076s; samplesPerSecond = 32942.4
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.14744161 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32791.2
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.14744161 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32791.2
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.13888184 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0076s; samplesPerSecond = 32795.5
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.13888184 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0076s; samplesPerSecond = 32795.5
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.14156678 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0076s; samplesPerSecond = 32855.8
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.14156678 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0076s; samplesPerSecond = 32855.8
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13990591 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0077s; samplesPerSecond = 32607.3
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13990591 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0077s; samplesPerSecond = 32607.3
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.15059729 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32855.8
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.15059729 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32855.8
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.14720846 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0076s; samplesPerSecond = 32799.8
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.14720846 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0076s; samplesPerSecond = 32799.8
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.13021243 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0076s; samplesPerSecond = 32912.1
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.13021243 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0076s; samplesPerSecond = 32912.1
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.19704037 * 250; EvalClassificationError = 0.11600000 * 250; time = 0.0076s; samplesPerSecond = 33029.5
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.19704037 * 250; EvalErrorPrediction = 0.11600000 * 250; time = 0.0076s; samplesPerSecond = 33029.5
-05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15858146 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0076s; samplesPerSecond = 32860.1
+05/03/2016 15:21:56: Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15858146 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0076s; samplesPerSecond = 32860.1
-05/03/2016 15:21:56: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.16938752 * 10000; EvalClassificationError = 0.07430000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=0.313881s
+05/03/2016 15:21:56: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.16938752 * 10000; EvalErrorPrediction = 0.07430000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=0.313881s
05/03/2016 15:21:56: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152142.598996/CNTKTextFormatReader/Examples/Other/Simple2d_MultiGpu@release_gpu/Models/multigpu.dnn.2'
05/03/2016 15:21:56: Starting Epoch 3: learning rate per sample = 0.008000  effective momentum = 0.900000  momentum as time constant = 237.3 samples
05/03/2016 15:21:56: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1).
-05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.18888809 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0078s; samplesPerSecond = 32129.5
+05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.18888809 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0078s; samplesPerSecond = 32129.5
-05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.14084978 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0076s; samplesPerSecond = 32756.8
+05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.14084978 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0076s; samplesPerSecond = 32756.8
-05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.14561895 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0077s; samplesPerSecond = 32666.9
+05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.14561895 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0077s; samplesPerSecond = 32666.9
-05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.13238169 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0076s; samplesPerSecond = 32752.5
+05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.13238169 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0076s; samplesPerSecond = 32752.5
-05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.17465335 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32765.4
+05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.17465335 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32765.4
-05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.17752616 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0076s; samplesPerSecond = 32821.3
+05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.17752616 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0076s; samplesPerSecond = 32821.3
-05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.15030556 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0077s; samplesPerSecond = 32645.6
+05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.15030556 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0077s; samplesPerSecond = 32645.6
-05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.17118019 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0077s; samplesPerSecond = 32611.5
+05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.17118019 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0077s; samplesPerSecond = 32611.5
-05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.10379908 * 250; EvalClassificationError = 0.04000000 * 250; time = 0.0077s; samplesPerSecond = 32637.1
+05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.10379908 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0077s; samplesPerSecond = 32637.1
-05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.20636150 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0076s; samplesPerSecond = 32782.6
+05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.20636150 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0076s; samplesPerSecond = 32782.6
-05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.16606704 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0077s; samplesPerSecond = 32543.6
+05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.16606704 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0077s; samplesPerSecond = 32543.6
-05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.14937580 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0077s; samplesPerSecond = 32446.5
+05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.14937580 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0077s; samplesPerSecond = 32446.5
-05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.19161901 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32731.1
+05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.19161901 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32731.1
-05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13684752 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32696.8
+05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13684752 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32696.8
-05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.21095939 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0076s; samplesPerSecond = 32688.3
+05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.21095939 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0076s; samplesPerSecond = 32688.3
-05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.13216461 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32769.7
+05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.13216461 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32769.7
-05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17341094 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0077s; samplesPerSecond = 32586.0
+05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17341094 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0077s; samplesPerSecond = 32586.0
05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16532641 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0076s; samplesPerSecond = 32868.8 05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16532641 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0076s; samplesPerSecond = 32868.8
05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.14614740 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0076s; samplesPerSecond = 32696.8 05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.14614740 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0076s; samplesPerSecond = 32696.8
05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.12551177 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32705.4 05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.12551177 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32705.4
05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13419939 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32782.6 05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13419939 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32782.6
05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.17050096 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32899.1 05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.17050096 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32899.1
05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.22579789 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0076s; samplesPerSecond = 32838.6 05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.22579789 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0076s; samplesPerSecond = 32838.6
05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.18219666 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0078s; samplesPerSecond = 32220.6 05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.18219666 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0078s; samplesPerSecond = 32220.6
05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.20347898 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32791.2 05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.20347898 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32791.2
05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.22972656 * 250; EvalClassificationError = 0.12000000 * 250; time = 0.0076s; samplesPerSecond = 32825.6 05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.22972656 * 250; EvalErrorPrediction = 0.12000000 * 250; time = 0.0076s; samplesPerSecond = 32825.6
05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12621914 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0076s; samplesPerSecond = 32890.4 05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12621914 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0076s; samplesPerSecond = 32890.4
05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.15674728 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32808.4 05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.15674728 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32808.4
05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.11517532 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0077s; samplesPerSecond = 32658.4 05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.11517532 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0077s; samplesPerSecond = 32658.4
05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.14187870 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32860.1 05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.14187870 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32860.1
05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.18496784 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0076s; samplesPerSecond = 32929.4 05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.18496784 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0076s; samplesPerSecond = 32929.4
05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.15026403 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32942.4 05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.15026403 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32942.4
05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.12862609 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32925.1 05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.12862609 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32925.1
05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.17651362 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32778.3 05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.17651362 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0076s; samplesPerSecond = 32778.3
05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.14975908 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0076s; samplesPerSecond = 32981.5 05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.14975908 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0076s; samplesPerSecond = 32981.5
05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.11465866 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0076s; samplesPerSecond = 32838.6 05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.11465866 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0076s; samplesPerSecond = 32838.6
05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16513610 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0076s; samplesPerSecond = 32808.4 05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16513610 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0076s; samplesPerSecond = 32808.4
05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14972374 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32977.2 05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14972374 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0076s; samplesPerSecond = 32977.2
05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.15995582 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32825.6 05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.15995582 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0076s; samplesPerSecond = 32825.6
05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.17898927 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0076s; samplesPerSecond = 32756.8 05/03/2016 15:21:56: Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.17898927 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0076s; samplesPerSecond = 32756.8
05/03/2016 15:21:56: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.16083773 * 10000; EvalClassificationError = 0.07760000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=0.307973s 05/03/2016 15:21:56: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.16083773 * 10000; EvalErrorPrediction = 0.07760000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=0.307973s
05/03/2016 15:21:56: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152142.598996/CNTKTextFormatReader/Examples/Other/Simple2d_MultiGpu@release_gpu/Models/multigpu.dnn' 05/03/2016 15:21:56: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152142.598996/CNTKTextFormatReader/Examples/Other/Simple2d_MultiGpu@release_gpu/Models/multigpu.dnn'
05/03/2016 15:21:56: CNTKCommandTrainEnd: Multigpu_Demo_Train 05/03/2016 15:21:56: CNTKCommandTrainEnd: Multigpu_Demo_Train
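Note: the epoch headers above report the same smoothing constant two ways, an effective momentum of 0.9 per minibatch and a "momentum as time constant" of 237.3 samples. Assuming a minibatch size of 25 (the usual Simple2d setting; it is not printed in this excerpt), the two are related by momentum = exp(-mbSize / timeConstant), which a few lines of Python confirm:

    import math

    mb_size = 25        # assumed minibatch size, not printed in this excerpt
    momentum = 0.9      # "effective momentum" from the epoch header
    time_constant = -mb_size / math.log(momentum)
    print(round(time_constant, 1))   # 237.3, matching the logged time constant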
@@ -624,7 +624,7 @@ Post-processing network...
7 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-EvalClassificationError = ClassificationError()
+EvalErrorPrediction = ErrorPrediction()
InvStdOfFeatures = InvStdDev()
MeanOfFeatures = Mean()
PosteriorProb = Softmax()
@@ -653,7 +653,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *1] -> [2 x 1 x *1]
Validating --> B2 = LearnableParameter() : -> [2 x 1]
Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *1], [2 x 1] -> [2 x 1 x *1]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *1] -> [2 x 1 x *1]
Validating --> Prior = Mean (labels) : [2 x *1] -> [2]
Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@@ -677,7 +677,7 @@ Allocating matrices for forward and/or backward propagation.
Memory Sharing Structure:
-(nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalClassificationError Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
+(nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalErrorPrediction Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
0x1222268: {[InvStdOfFeatures Value[2]] }
0x1223258: {[W2 Value[2 x 50]] }
0x12a56c8: {[B0 Value[50 x 1]] }
@@ -697,12 +697,12 @@ Memory Sharing Structure:
0x2adcc08: {[W0*features Value[50 x *1]] }
0x2add0a8: {[W0 Value[50 x 2]] }
0x2ae0518: {[W1 Value[50 x 50]] }
-0x68bf228: {[EvalClassificationError Value[1]] }
+0x68bf228: {[EvalErrorPrediction Value[1]] }
0x68bf388: {[CrossEntropyWithSoftmax Value[1]] }
0x68bf988: {[LogOfPrior Value[2]] }
0x68d0438: {[features Value[2 x *1]] }
-05/03/2016 15:21:57: Final Results: Minibatch[1-1]: EvalClassificationError = 0.05804312 * 603; CrossEntropyWithSoftmax = 0.12736577 * 603; perplexity = 1.13583240
+05/03/2016 15:21:57: Final Results: Minibatch[1-1]: EvalErrorPrediction = 0.05804312 * 603; CrossEntropyWithSoftmax = 0.12736577 * 603; perplexity = 1.13583240
05/03/2016 15:21:57: Action "test" complete.
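Note: each result line in these baselines prints a metric as mean * count. The derived numbers on the "Final Results" line follow directly from those two values; a quick sanity check (constants copied from the log):

    import math

    ce_mean, err_mean, n = 0.12736577, 0.05804312, 603
    print(math.exp(ce_mean))     # ~1.13583240, the reported perplexity
    print(round(err_mean * n))   # 35, i.e. 35 of 603 test samples misclassified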
View file
@@ -66,7 +66,7 @@ Multigpu_Demo_Train=[
SimpleNetworkBuilder = [
layerSizes = 2:50*2:2
trainingCriterion = "CrossEntropyWithSoftmax"
-evalCriterion = "ClassificationError"
+evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
initValueScale = 1.0
applyMeanVarNorm = true
@@ -167,7 +167,7 @@ Multigpu_Demo_Train=[
SimpleNetworkBuilder = [
layerSizes = 2:50*2:2
trainingCriterion = "CrossEntropyWithSoftmax"
-evalCriterion = "ClassificationError"
+evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
initValueScale = 1.0
applyMeanVarNorm = true
@@ -300,7 +300,7 @@ configparameters: Multigpu.cntk:Multigpu_Demo_Train=[
SimpleNetworkBuilder = [
layerSizes = 2:50*2:2
trainingCriterion = "CrossEntropyWithSoftmax"
-evalCriterion = "ClassificationError"
+evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
initValueScale = 1.0
applyMeanVarNorm = true
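Note: the three configuration hunks above all make the same one-line change, reverting the evaluation criterion in these baselines from the newer ClassificationError name back to ErrorPrediction (the identical metric values elsewhere in this diff suggest the two names denote the same criterion node). For bulk edits of such baselines, a throwaway rename table is enough; the helper below is a hypothetical sketch, not part of CNTK:

    # Hypothetical one-off helper for renaming criteria in config/baseline text.
    ALIASES = {"ClassificationError": "ErrorPrediction"}

    def revert_name(line: str) -> str:
        """Apply the same substitution this diff applies, line by line."""
        for new, old in ALIASES.items():
            line = line.replace(new, old)   # also maps EvalClassificationError
        return line                          # to EvalErrorPrediction

    print(revert_name('evalCriterion = "ClassificationError"'))
    # -> evalCriterion = "ErrorPrediction"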
@@ -368,7 +368,7 @@ Post-processing network...
7 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-EvalClassificationError = ClassificationError()
+EvalErrorPrediction = ErrorPrediction()
InvStdOfFeatures = InvStdDev()
MeanOfFeatures = Mean()
PosteriorProb = Softmax()
@@ -397,7 +397,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *] -> [2 x 1 x *]
Validating --> B2 = LearnableParameter() : -> [2 x 1]
Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *], [2 x 1] -> [2 x 1 x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *] -> [2 x 1 x *]
Validating --> Prior = Mean (labels) : [2 x *] -> [2]
Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
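Note: in the validation lines above, * (or *1 in the GPU log) is the variable-length batch/sequence axis that shape inference carries through unchanged. A numpy stand-in for the Times rule [2 x 50], [50 x 1 x *] -> [2 x 1 x *], with an arbitrary 25 standing in for *:

    import numpy as np

    N = 25                                  # illustrative value for *
    W2 = np.random.randn(2, 50)
    H2 = np.random.randn(50, 1, N)
    out = np.einsum("ij,jkn->ikn", W2, H2)  # contract the shared 50-dim axis
    print(out.shape)                        # (2, 1, 25), i.e. [2 x 1 x *]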
@@ -421,14 +421,14 @@ Post-processing network complete.
05/03/2016 15:29:48: Evaluation criterion node(s):
-05/03/2016 15:29:48: EvalClassificationError = ClassificationError
+05/03/2016 15:29:48: EvalErrorPrediction = ErrorPrediction
Allocating matrices for forward and/or backward propagation.
Memory Sharing Structure:
-0000000000000000: {[EvalClassificationError Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
+0000000000000000: {[EvalErrorPrediction Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
000000CDDFBEECA0: {[features Value[2 x *]] }
000000CDDFC7B170: {[W0*features+B0 Gradient[50 x 1 x *]] [W1*H1 Value[50 x 1 x *]] }
000000CDDFC7B490: {[HLast Value[2 x 1 x *]] [W2 Gradient[2 x 50]] }
@@ -438,7 +438,7 @@ Memory Sharing Structure:
000000CDDFC7B990: {[H1 Value[50 x 1 x *]] [W0*features Gradient[50 x *]] }
000000CDDFC7BC10: {[LogOfPrior Value[2]] }
000000CDDFC7BCB0: {[MVNormalizedFeatures Value[2 x *]] }
-000000CDDFC7BD50: {[EvalClassificationError Value[1]] }
+000000CDDFC7BD50: {[EvalErrorPrediction Value[1]] }
000000CDDFC7BDF0: {[W0 Gradient[50 x 2]] [W0*features+B0 Value[50 x 1 x *]] }
000000CDDFC7BF30: {[ScaledLogLikelihood Value[2 x 1 x *]] }
000000CDDFC7C070: {[H2 Value[50 x 1 x *]] [W1*H1 Gradient[50 x 1 x *]] }
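Note: the memory-sharing dump groups every node's value/gradient matrix by the buffer it was assigned. Matrices listed behind the same address reuse one allocation, and the all-zero address ((nil) in the GPU log earlier) collects matrices that never need to be materialized, such as gradients of inputs and of the evaluation criterion. A toy regrouping in Python, using a few entries copied from the dump:

    from collections import defaultdict

    assignments = [                       # (matrix, buffer) pairs from the dump
        ("EvalErrorPrediction Gradient[1]", 0x0),
        ("features Value[2 x *]", 0xCDDFBEECA0),
        ("W0*features+B0 Gradient[50 x 1 x *]", 0xCDDFC7B170),
        ("W1*H1 Value[50 x 1 x *]", 0xCDDFC7B170),   # shares the buffer above
    ]

    by_buffer = defaultdict(list)
    for matrix, addr in assignments:
        by_buffer[addr].append(matrix)

    for addr, matrices in sorted(by_buffer.items()):
        print(f"{addr:016X}: {matrices}")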
@@ -471,139 +471,139 @@ Memory Sharing Structure:
05/03/2016 15:29:48: Starting Epoch 1: learning rate per sample = 0.020000 effective momentum = 0.900000 momentum as time constant = 237.3 samples
05/03/2016 15:29:48: Starting minibatch loop.
-05/03/2016 15:29:48: Epoch[ 1 of 3]-Minibatch[ 1- 10]: CrossEntropyWithSoftmax = 0.70511987 * 250; EvalClassificationError = 0.55200000 * 250; time = 0.0377s; samplesPerSecond = 6637.8
+05/03/2016 15:29:48: Epoch[ 1 of 3]-Minibatch[ 1- 10]: CrossEntropyWithSoftmax = 0.70511987 * 250; EvalErrorPrediction = 0.55200000 * 250; time = 0.0377s; samplesPerSecond = 6637.8
-05/03/2016 15:29:48: Epoch[ 1 of 3]-Minibatch[ 11- 20]: CrossEntropyWithSoftmax = 0.69754895 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0300s; samplesPerSecond = 8341.4
+05/03/2016 15:29:48: Epoch[ 1 of 3]-Minibatch[ 11- 20]: CrossEntropyWithSoftmax = 0.69754895 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0300s; samplesPerSecond = 8341.4
-05/03/2016 15:29:48: Epoch[ 1 of 3]-Minibatch[ 21- 30]: CrossEntropyWithSoftmax = 0.71056921 * 250; EvalClassificationError = 0.50400000 * 250; time = 0.0285s; samplesPerSecond = 8758.7
+05/03/2016 15:29:48: Epoch[ 1 of 3]-Minibatch[ 21- 30]: CrossEntropyWithSoftmax = 0.71056921 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.0285s; samplesPerSecond = 8758.7
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 31- 40]: CrossEntropyWithSoftmax = 0.72951074 * 250; EvalClassificationError = 0.56000000 * 250; time = 0.0290s; samplesPerSecond = 8610.3
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 31- 40]: CrossEntropyWithSoftmax = 0.72951074 * 250; EvalErrorPrediction = 0.56000000 * 250; time = 0.0290s; samplesPerSecond = 8610.3
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 41- 50]: CrossEntropyWithSoftmax = 0.70946655 * 250; EvalClassificationError = 0.48800000 * 250; time = 0.0285s; samplesPerSecond = 8776.9
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 41- 50]: CrossEntropyWithSoftmax = 0.70946655 * 250; EvalErrorPrediction = 0.48800000 * 250; time = 0.0285s; samplesPerSecond = 8776.9
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 51- 60]: CrossEntropyWithSoftmax = 0.72656787 * 250; EvalClassificationError = 0.54400000 * 250; time = 0.0289s; samplesPerSecond = 8652.6
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 51- 60]: CrossEntropyWithSoftmax = 0.72656787 * 250; EvalErrorPrediction = 0.54400000 * 250; time = 0.0289s; samplesPerSecond = 8652.6
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 61- 70]: CrossEntropyWithSoftmax = 0.69337402 * 250; EvalClassificationError = 0.43200000 * 250; time = 0.0288s; samplesPerSecond = 8670.9
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 61- 70]: CrossEntropyWithSoftmax = 0.69337402 * 250; EvalErrorPrediction = 0.43200000 * 250; time = 0.0288s; samplesPerSecond = 8670.9
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 71- 80]: CrossEntropyWithSoftmax = 0.73605176 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0277s; samplesPerSecond = 9033.4
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 71- 80]: CrossEntropyWithSoftmax = 0.73605176 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0277s; samplesPerSecond = 9033.4
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 81- 90]: CrossEntropyWithSoftmax = 0.71453076 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0271s; samplesPerSecond = 9209.5
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 81- 90]: CrossEntropyWithSoftmax = 0.71453076 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0271s; samplesPerSecond = 9209.5
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 91- 100]: CrossEntropyWithSoftmax = 0.75191992 * 250; EvalClassificationError = 0.47200000 * 250; time = 0.0247s; samplesPerSecond = 10134.6
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 91- 100]: CrossEntropyWithSoftmax = 0.75191992 * 250; EvalErrorPrediction = 0.47200000 * 250; time = 0.0247s; samplesPerSecond = 10134.6
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.75975146 * 250; EvalClassificationError = 0.53200000 * 250; time = 0.0270s; samplesPerSecond = 9243.5
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.75975146 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0270s; samplesPerSecond = 9243.5
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.73172168 * 250; EvalClassificationError = 0.50800000 * 250; time = 0.0268s; samplesPerSecond = 9333.9
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.73172168 * 250; EvalErrorPrediction = 0.50800000 * 250; time = 0.0268s; samplesPerSecond = 9333.9
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.76840820 * 250; EvalClassificationError = 0.53200000 * 250; time = 0.0265s; samplesPerSecond = 9435.7
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.76840820 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0265s; samplesPerSecond = 9435.7
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.70464746 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0269s; samplesPerSecond = 9309.3
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.70464746 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0269s; samplesPerSecond = 9309.3
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.70557227 * 250; EvalClassificationError = 0.46400000 * 250; time = 0.0253s; samplesPerSecond = 9880.3
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.70557227 * 250; EvalErrorPrediction = 0.46400000 * 250; time = 0.0253s; samplesPerSecond = 9880.3
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.72711816 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0267s; samplesPerSecond = 9357.7
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.72711816 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0267s; samplesPerSecond = 9357.7
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70076660 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0270s; samplesPerSecond = 9264.1
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70076660 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0270s; samplesPerSecond = 9264.1
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.69409766 * 250; EvalClassificationError = 0.49600000 * 250; time = 0.0257s; samplesPerSecond = 9716.3
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.69409766 * 250; EvalErrorPrediction = 0.49600000 * 250; time = 0.0257s; samplesPerSecond = 9716.3
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.69139941 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0257s; samplesPerSecond = 9742.4
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.69139941 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0257s; samplesPerSecond = 9742.4
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.73361621 * 250; EvalClassificationError = 0.55200000 * 250; time = 0.0295s; samplesPerSecond = 8477.4
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.73361621 * 250; EvalErrorPrediction = 0.55200000 * 250; time = 0.0295s; samplesPerSecond = 8477.4
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.72225879 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0273s; samplesPerSecond = 9161.9
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.72225879 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0273s; samplesPerSecond = 9161.9
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70356348 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0261s; samplesPerSecond = 9562.8
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70356348 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0261s; samplesPerSecond = 9562.8
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69928613 * 250; EvalClassificationError = 0.46400000 * 250; time = 0.0254s; samplesPerSecond = 9848.7
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69928613 * 250; EvalErrorPrediction = 0.46400000 * 250; time = 0.0254s; samplesPerSecond = 9848.7
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.72360938 * 250; EvalClassificationError = 0.51600000 * 250; time = 0.0252s; samplesPerSecond = 9924.6
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.72360938 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0252s; samplesPerSecond = 9924.6
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.69871875 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0262s; samplesPerSecond = 9530.7
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.69871875 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0262s; samplesPerSecond = 9530.7
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.69114844 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0257s; samplesPerSecond = 9720.1
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.69114844 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0257s; samplesPerSecond = 9720.1
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.68648047 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0273s; samplesPerSecond = 9161.9
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.68648047 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0273s; samplesPerSecond = 9161.9
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.69657227 * 250; EvalClassificationError = 0.46400000 * 250; time = 0.0270s; samplesPerSecond = 9259.9
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.69657227 * 250; EvalErrorPrediction = 0.46400000 * 250; time = 0.0270s; samplesPerSecond = 9259.9
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.71585547 * 250; EvalClassificationError = 0.45200000 * 250; time = 0.0264s; samplesPerSecond = 9486.2
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.71585547 * 250; EvalErrorPrediction = 0.45200000 * 250; time = 0.0264s; samplesPerSecond = 9486.2
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.69730664 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0261s; samplesPerSecond = 9595.1
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.69730664 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0261s; samplesPerSecond = 9595.1
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.70432422 * 250; EvalClassificationError = 0.53200000 * 250; time = 0.0244s; samplesPerSecond = 10248.8
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.70432422 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0244s; samplesPerSecond = 10248.8
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.69991797 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0220s; samplesPerSecond = 11388.0
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.69991797 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0220s; samplesPerSecond = 11388.0
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.68696875 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0222s; samplesPerSecond = 11277.0
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.68696875 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0222s; samplesPerSecond = 11277.0
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.67331445 * 250; EvalClassificationError = 0.37200000 * 250; time = 0.0245s; samplesPerSecond = 10192.4
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.67331445 * 250; EvalErrorPrediction = 0.37200000 * 250; time = 0.0245s; samplesPerSecond = 10192.4
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.65711328 * 250; EvalClassificationError = 0.43200000 * 250; time = 0.0240s; samplesPerSecond = 10429.3
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.65711328 * 250; EvalErrorPrediction = 0.43200000 * 250; time = 0.0240s; samplesPerSecond = 10429.3
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.64534375 * 250; EvalClassificationError = 0.44800000 * 250; time = 0.0243s; samplesPerSecond = 10305.0
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.64534375 * 250; EvalErrorPrediction = 0.44800000 * 250; time = 0.0243s; samplesPerSecond = 10305.0
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.61021875 * 250; EvalClassificationError = 0.36400000 * 250; time = 0.0236s; samplesPerSecond = 10606.3
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.61021875 * 250; EvalErrorPrediction = 0.36400000 * 250; time = 0.0236s; samplesPerSecond = 10606.3
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.54191016 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0236s; samplesPerSecond = 10578.4
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.54191016 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0236s; samplesPerSecond = 10578.4
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.45624414 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0232s; samplesPerSecond = 10762.4
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.45624414 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0232s; samplesPerSecond = 10762.4
-05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.37636133 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0235s; samplesPerSecond = 10623.8
+05/03/2016 15:29:49: Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.37636133 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0235s; samplesPerSecond = 10623.8
-05/03/2016 15:29:49: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.68695688 * 10000; EvalClassificationError = 0.45550000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=1.06166s
+05/03/2016 15:29:49: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.68695688 * 10000; EvalErrorPrediction = 0.45550000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=1.06166s
05/03/2016 15:29:49: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503162947.903093\CNTKTextFormatReader\Examples\Other\Simple2d_MultiGpu@release_cpu/Models/multigpu.dnn.1'
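Note: the throughput figures are simply samples divided by wall-clock time for each 250-sample minibatch. Because the printed time is rounded to four decimals, recomputing from the log reproduces the printed rate only approximately:

    samples, t = 250, 0.0377   # first minibatch of the CPU run above
    print(samples / t)         # ~6631.3 vs. the logged 6637.8 (time was rounded)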
05/03/2016 15:29:49: Starting Epoch 2: learning rate per sample = 0.008000 effective momentum = 0.900000 momentum as time constant = 237.3 samples
05/03/2016 15:29:49: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1).
-05/03/2016 15:29:49: Epoch[ 2 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.28780429 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0246s; samplesPerSecond = 10181.2
+05/03/2016 15:29:49: Epoch[ 2 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.28780429 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0246s; samplesPerSecond = 10181.2
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.28222478 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0246s; samplesPerSecond = 10178.3
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.28222478 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0246s; samplesPerSecond = 10178.3
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.23589864 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0255s; samplesPerSecond = 9796.2
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.23589864 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0255s; samplesPerSecond = 9796.2
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.21209458 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0242s; samplesPerSecond = 10312.3
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.21209458 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0242s; samplesPerSecond = 10312.3
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.20285913 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0243s; samplesPerSecond = 10283.0
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.20285913 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0243s; samplesPerSecond = 10283.0
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.21300948 * 250; EvalClassificationError = 0.09600000 * 250; time = 0.0252s; samplesPerSecond = 9928.5
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.21300948 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0252s; samplesPerSecond = 9928.5
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.17835594 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0256s; samplesPerSecond = 9753.8
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.17835594 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0256s; samplesPerSecond = 9753.8
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.18830077 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0257s; samplesPerSecond = 9740.1
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.18830077 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0257s; samplesPerSecond = 9740.1
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.14198478 * 250; EvalClassificationError = 0.04000000 * 250; time = 0.0250s; samplesPerSecond = 10019.2
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.14198478 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0250s; samplesPerSecond = 10019.2
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.15895022 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0237s; samplesPerSecond = 10566.8
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.15895022 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0237s; samplesPerSecond = 10566.8
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.21062646 * 250; EvalClassificationError = 0.11200000 * 250; time = 0.0238s; samplesPerSecond = 10517.9
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.21062646 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0238s; samplesPerSecond = 10517.9
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.16081948 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0223s; samplesPerSecond = 11186.7
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.16081948 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0223s; samplesPerSecond = 11186.7
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.15635713 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0234s; samplesPerSecond = 10700.2
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.15635713 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0234s; samplesPerSecond = 10700.2
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13008516 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0239s; samplesPerSecond = 10453.7
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13008516 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0239s; samplesPerSecond = 10453.7
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.16625347 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0234s; samplesPerSecond = 10674.2
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.16625347 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0234s; samplesPerSecond = 10674.2
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.15001793 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0245s; samplesPerSecond = 10223.7
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.15001793 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0245s; samplesPerSecond = 10223.7
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.22343917 * 250; EvalClassificationError = 0.11200000 * 250; time = 0.0234s; samplesPerSecond = 10692.4
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.22343917 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0234s; samplesPerSecond = 10692.4
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18006735 * 250; EvalClassificationError = 0.09600000 * 250; time = 0.0245s; samplesPerSecond = 10194.5
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18006735 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0245s; samplesPerSecond = 10194.5
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.15361620 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0235s; samplesPerSecond = 10636.9
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.15361620 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0235s; samplesPerSecond = 10636.9
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.17039588 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0246s; samplesPerSecond = 10177.1
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.17039588 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0246s; samplesPerSecond = 10177.1
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.15516786 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0237s; samplesPerSecond = 10544.1
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.15516786 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0237s; samplesPerSecond = 10544.1
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.15969617 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0225s; samplesPerSecond = 11102.2
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.15969617 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0225s; samplesPerSecond = 11102.2
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.15939439 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0234s; samplesPerSecond = 10697.9
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.15939439 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0234s; samplesPerSecond = 10697.9
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.15300194 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0233s; samplesPerSecond = 10729.2
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.15300194 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0233s; samplesPerSecond = 10729.2
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.14902476 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0231s; samplesPerSecond = 10811.7
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.14902476 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0231s; samplesPerSecond = 10811.7
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.15043256 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0231s; samplesPerSecond = 10823.4
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.15043256 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0231s; samplesPerSecond = 10823.4
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.15531360 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0229s; samplesPerSecond = 10936.1
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.15531360 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0229s; samplesPerSecond = 10936.1
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.17990796 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0248s; samplesPerSecond = 10088.4
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.17990796 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0248s; samplesPerSecond = 10088.4
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.22925668 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0229s; samplesPerSecond = 10913.7
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.22925668 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0229s; samplesPerSecond = 10913.7
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16843626 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0234s; samplesPerSecond = 10682.8
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16843626 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0234s; samplesPerSecond = 10682.8
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.18045325 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0236s; samplesPerSecond = 10585.6
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.18045325 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0236s; samplesPerSecond = 10585.6
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.13337526 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0221s; samplesPerSecond = 11308.6
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.13337526 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0221s; samplesPerSecond = 11308.6
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.14332977 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0245s; samplesPerSecond = 10219.9
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.14332977 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0245s; samplesPerSecond = 10219.9
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.18749446 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0242s; samplesPerSecond = 10326.7
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.18749446 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0242s; samplesPerSecond = 10326.7
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.15505967 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0236s; samplesPerSecond = 10587.8
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.15505967 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0236s; samplesPerSecond = 10587.8
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.19616616 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0228s; samplesPerSecond = 10980.3
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.19616616 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0228s; samplesPerSecond = 10980.3
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.17305907 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0236s; samplesPerSecond = 10610.3
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.17305907 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0236s; samplesPerSecond = 10610.3
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.15197365 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0249s; samplesPerSecond = 10033.3
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.15197365 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0249s; samplesPerSecond = 10033.3
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.12102416 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0238s; samplesPerSecond = 10483.5
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.12102416 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0238s; samplesPerSecond = 10483.5
-05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15278496 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0235s; samplesPerSecond = 10646.9
+05/03/2016 15:29:50: Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15278496 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0235s; samplesPerSecond = 10646.9
-05/03/2016 15:29:50: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.17643784 * 10000; EvalClassificationError = 0.07560000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=0.957696s
+05/03/2016 15:29:50: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.17643784 * 10000; EvalErrorPrediction = 0.07560000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=0.957696s
05/03/2016 15:29:50: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503162947.903093\CNTKTextFormatReader\Examples\Other\Simple2d_MultiGpu@release_cpu/Models/multigpu.dnn.2'
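Note: both runs train with DataParallelSGD and NumGradientBits = 1, i.e. 1-bit SGD: each gradient value is exchanged as a single bit, and the quantization error is remembered and added back into the next minibatch's gradient (error feedback). The sketch below illustrates the idea only; it is not CNTK's implementation:

    import numpy as np

    def one_bit_quantize(grad, residual):
        g = grad + residual                    # add back last step's error
        pos = g >= 0
        pos_mean = g[pos].mean() if pos.any() else 0.0
        neg_mean = g[~pos].mean() if (~pos).any() else 0.0
        q = np.where(pos, pos_mean, neg_mean)  # 1 bit/value plus two scalars
        return q, g - q                        # residual feeds the next step

    rng = np.random.default_rng(0)
    grad = rng.normal(size=8)
    q, residual = one_bit_quantize(grad, np.zeros_like(grad))
    print(q)
    print(residual)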
05/03/2016 15:29:50: Starting Epoch 3: learning rate per sample = 0.008000 effective momentum = 0.900000 momentum as time constant = 237.3 samples 05/03/2016 15:29:50: Starting Epoch 3: learning rate per sample = 0.008000 effective momentum = 0.900000 momentum as time constant = 237.3 samples
05/03/2016 15:29:50: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1). 05/03/2016 15:29:50: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1).
-05/03/2016 15:29:50: Epoch[ 3 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.10623312 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0235s; samplesPerSecond = 10637.4
-05/03/2016 15:29:50: Epoch[ 3 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.17519442 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0236s; samplesPerSecond = 10608.5
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.14133983 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0240s; samplesPerSecond = 10404.5
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.16278491 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0233s; samplesPerSecond = 10749.0
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.11783558 * 250; EvalClassificationError = 0.04000000 * 250; time = 0.0232s; samplesPerSecond = 10780.0
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.16342188 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0243s; samplesPerSecond = 10305.9
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.16272195 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0239s; samplesPerSecond = 10476.9
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.19401477 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0241s; samplesPerSecond = 10370.0
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.20186661 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0229s; samplesPerSecond = 10903.2
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.13672539 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0235s; samplesPerSecond = 10631.1
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.20069212 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0234s; samplesPerSecond = 10681.5
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.17729039 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0252s; samplesPerSecond = 9928.1
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.15906107 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0251s; samplesPerSecond = 9941.5
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16281632 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0247s; samplesPerSecond = 10121.5
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.19834981 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0248s; samplesPerSecond = 10067.7
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.10217642 * 250; EvalClassificationError = 0.04000000 * 250; time = 0.0247s; samplesPerSecond = 10105.1
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17011383 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0258s; samplesPerSecond = 9692.2
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16599137 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0252s; samplesPerSecond = 9911.6
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.12648996 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0254s; samplesPerSecond = 9848.7
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.11920298 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0248s; samplesPerSecond = 10091.2
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.12883164 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0272s; samplesPerSecond = 9205.1
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.18222479 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0250s; samplesPerSecond = 9988.0
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.13443351 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0246s; samplesPerSecond = 10149.4
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.19720325 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0244s; samplesPerSecond = 10230.8
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.15586137 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0254s; samplesPerSecond = 9860.4
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.11854887 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0250s; samplesPerSecond = 9991.6
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.13705285 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0249s; samplesPerSecond = 10050.7
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.20009941 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0240s; samplesPerSecond = 10411.5
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.19078680 * 250; EvalClassificationError = 0.11200000 * 250; time = 0.0233s; samplesPerSecond = 10741.6
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16505705 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0238s; samplesPerSecond = 10507.7
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.12232722 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0239s; samplesPerSecond = 10472.1
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.16342047 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0238s; samplesPerSecond = 10514.4
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.15875107 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0234s; samplesPerSecond = 10688.3
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.12248772 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0232s; samplesPerSecond = 10793.5
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13457009 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0238s; samplesPerSecond = 10521.4
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.20976565 * 250; EvalClassificationError = 0.11600000 * 250; time = 0.0238s; samplesPerSecond = 10494.9
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16519102 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0230s; samplesPerSecond = 10862.5
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14971420 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0247s; samplesPerSecond = 10106.3
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.16456633 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0230s; samplesPerSecond = 10858.2
-05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.16971407 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0239s; samplesPerSecond = 10473.0
-05/03/2016 15:29:51: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.15787325 * 10000; EvalClassificationError = 0.07430000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=0.972052s
+05/03/2016 15:29:50: Epoch[ 3 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.10623312 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0235s; samplesPerSecond = 10637.4
+05/03/2016 15:29:50: Epoch[ 3 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.17519442 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0236s; samplesPerSecond = 10608.5
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.14133983 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0240s; samplesPerSecond = 10404.5
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.16278491 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0233s; samplesPerSecond = 10749.0
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.11783558 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0232s; samplesPerSecond = 10780.0
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.16342188 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0243s; samplesPerSecond = 10305.9
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.16272195 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0239s; samplesPerSecond = 10476.9
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.19401477 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0241s; samplesPerSecond = 10370.0
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.20186661 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0229s; samplesPerSecond = 10903.2
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.13672539 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0235s; samplesPerSecond = 10631.1
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.20069212 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0234s; samplesPerSecond = 10681.5
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.17729039 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0252s; samplesPerSecond = 9928.1
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.15906107 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0251s; samplesPerSecond = 9941.5
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16281632 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0247s; samplesPerSecond = 10121.5
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.19834981 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0248s; samplesPerSecond = 10067.7
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.10217642 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0247s; samplesPerSecond = 10105.1
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17011383 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0258s; samplesPerSecond = 9692.2
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16599137 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0252s; samplesPerSecond = 9911.6
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.12648996 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0254s; samplesPerSecond = 9848.7
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.11920298 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0248s; samplesPerSecond = 10091.2
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.12883164 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0272s; samplesPerSecond = 9205.1
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.18222479 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0250s; samplesPerSecond = 9988.0
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.13443351 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0246s; samplesPerSecond = 10149.4
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.19720325 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0244s; samplesPerSecond = 10230.8
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.15586137 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0254s; samplesPerSecond = 9860.4
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.11854887 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0250s; samplesPerSecond = 9991.6
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.13705285 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0249s; samplesPerSecond = 10050.7
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.20009941 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0240s; samplesPerSecond = 10411.5
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.19078680 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0233s; samplesPerSecond = 10741.6
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16505705 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0238s; samplesPerSecond = 10507.7
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.12232722 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0239s; samplesPerSecond = 10472.1
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.16342047 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0238s; samplesPerSecond = 10514.4
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.15875107 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0234s; samplesPerSecond = 10688.3
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.12248772 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0232s; samplesPerSecond = 10793.5
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13457009 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0238s; samplesPerSecond = 10521.4
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.20976565 * 250; EvalErrorPrediction = 0.11600000 * 250; time = 0.0238s; samplesPerSecond = 10494.9
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16519102 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0230s; samplesPerSecond = 10862.5
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14971420 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0247s; samplesPerSecond = 10106.3
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.16456633 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0230s; samplesPerSecond = 10858.2
+05/03/2016 15:29:51: Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.16971407 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0239s; samplesPerSecond = 10473.0
+05/03/2016 15:29:51: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.15787325 * 10000; EvalErrorPrediction = 0.07430000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=0.972052s
05/03/2016 15:29:51: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503162947.903093\CNTKTextFormatReader\Examples\Other\Simple2d_MultiGpu@release_cpu/Models/multigpu.dnn'
05/03/2016 15:29:51: CNTKCommandTrainEnd: Multigpu_Demo_Train
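The per-minibatch lines above follow one fixed pattern: each criterion is logged as an average times the sample count it covers, and samplesPerSecond is that count divided by the reported time. The momentum line states a single setting in two equivalent forms; assuming the 25-sample minibatches implied by 250 samples per 10-minibatch window (an inference, the log never prints the minibatch size), the time-constant form is tau = -mbSize / ln(momentum). A quick sanity check in Python:

    import math

    print(250 / 0.0242)         # ~10331 samples/s, matching "samplesPerSecond = 10326.7" up to timer rounding
    print(-25 / math.log(0.9))  # ~237.3, matching "momentum as time constant = 237.3 samples"
    print(0.15787325 * 10000)   # summed epoch-3 CrossEntropyWithSoftmax behind "0.15787325 * 10000"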
@@ -621,7 +621,7 @@ Post-processing network...
7 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-EvalClassificationError = ClassificationError()
+EvalErrorPrediction = ErrorPrediction()
InvStdOfFeatures = InvStdDev()
MeanOfFeatures = Mean()
PosteriorProb = Softmax()
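The only change in this hunk is the node name: ErrorPrediction and ClassificationError are two names for the same evaluation criterion, the fraction of samples whose highest-scoring output class disagrees with the one-hot label. A minimal NumPy sketch of that computation (illustrative only, not CNTK's implementation):

    import numpy as np

    def classification_error(labels, scores):
        # labels, scores: [num_classes, num_samples], matching the [2 x *] shapes above
        pred = scores.reshape(scores.shape[0], -1).argmax(axis=0)
        return float(np.mean(pred != labels.argmax(axis=0)))

    labels = np.array([[1, 0, 1], [0, 1, 0]])               # one-hot, 3 samples
    scores = np.array([[2.0, 0.5, 0.1], [1.0, 1.5, 0.3]])
    print(classification_error(labels, scores))             # 0.333..., only the last sample is misclassified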
@@ -650,7 +650,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *1] -> [2 x 1 x *1]
Validating --> B2 = LearnableParameter() : -> [2 x 1]
Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *1], [2 x 1] -> [2 x 1 x *1]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *1] -> [2 x 1 x *1]
Validating --> Prior = Mean (labels) : [2 x *1] -> [2]
Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
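In these Validating lines the bracketed shapes are per-sample tensor dimensions, with * (or *1) standing for the dynamic minibatch/sequence axis: Times applies the [2 x 50] matrix to every column on that axis, and Plus broadcasts the [2 x 1] bias across it. A shape check under that reading (NumPy, with an arbitrary batch of 7):

    import numpy as np

    n = 7                                            # the dynamic axis *1
    W2, H2 = np.ones((2, 50)), np.ones((50, 1, n))
    B2 = np.ones((2, 1, 1))                          # the [2 x 1] bias, broadcast over the dynamic axis
    HLast = np.einsum('ij,jkn->ikn', W2, H2) + B2    # Times, then Plus
    print(HLast.shape)                               # (2, 1, 7), i.e. [2 x 1 x *1]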
@@ -674,7 +674,7 @@ Allocating matrices for forward and/or backward propagation.
Memory Sharing Structure:
-0000000000000000: {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalClassificationError Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
+0000000000000000: {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalErrorPrediction Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
000000CDDFC7B490: {[W0 Value[50 x 2]] }
000000CDDFC7B530: {[features Value[2 x *1]] }
000000CDDFC7B710: {[W1 Value[50 x 50]] }
@@ -690,7 +690,7 @@ Memory Sharing Structure:
000000CDDFC8C2B0: {[W1*H1+B1 Value[50 x 1 x *1]] }
000000CDDFC8C490: {[CrossEntropyWithSoftmax Value[1]] }
000000CDDFC8C5D0: {[LogOfPrior Value[2]] }
-000000CDDFC8C670: {[EvalClassificationError Value[1]] }
+000000CDDFC8C670: {[EvalErrorPrediction Value[1]] }
000000CDDFC8C990: {[MVNormalizedFeatures Value[2 x *1]] }
000000CDDFC8CA30: {[H2 Value[50 x 1 x *1]] }
000000CDDFC8CC10: {[W1*H1 Value[50 x 1 x *1]] }
@@ -699,7 +699,7 @@ Memory Sharing Structure:
000000CDDFC8D610: {[HLast Value[2 x 1 x *1]] }
000000CDDFC8D750: {[W0*features+B0 Value[50 x 1 x *1]] }
-05/03/2016 15:29:52: Final Results: Minibatch[1-1]: EvalClassificationError = 0.05306799 * 603; CrossEntropyWithSoftmax = 0.11782631 * 603; perplexity = 1.12504868
+05/03/2016 15:29:52: Final Results: Minibatch[1-1]: EvalErrorPrediction = 0.05306799 * 603; CrossEntropyWithSoftmax = 0.11782631 * 603; perplexity = 1.12504868
05/03/2016 15:29:52: Action "test" complete.
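The final-results line is internally consistent: the reported perplexity is the exponential of the average cross entropy, and the error rate times the sample count recovers a whole number of misclassified samples:

    \mathrm{perplexity} = \exp(\overline{\mathrm{CE}}) = \exp(0.11782631) \approx 1.12504868,
    \qquad 0.05306799 \times 603 \approx 32 \ \text{errors out of 603 test samples.}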
@@ -66,7 +66,7 @@ Multigpu_Demo_Train=[
SimpleNetworkBuilder = [
layerSizes = 2:50*2:2
trainingCriterion = "CrossEntropyWithSoftmax"
-evalCriterion = "ClassificationError"
+evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
initValueScale = 1.0
applyMeanVarNorm = true
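Read layerSizes = 2:50*2:2 as a 2-dimensional input, two 50-unit hidden layers (the *2 is a repeat count), and a 2-class output; together with layerTypes = "Sigmoid" and applyMeanVarNorm = true this is the graph validated above (MVNormalizedFeatures, H1, H2, HLast). A rough NumPy rendering of that forward pass, as a sketch under those assumptions (random placeholder weights, and per-batch normalization statistics where CNTK precomputes them over the training set):

    import numpy as np

    rng = np.random.default_rng(0)
    sizes = [2, 50, 50, 2]                          # layerSizes = 2:50*2:2
    Ws = [rng.normal(0, 1, (m, n)) for n, m in zip(sizes, sizes[1:])]
    Bs = [np.zeros((m, 1)) for m in sizes[1:]]

    def forward(x):
        h = (x - x.mean(1, keepdims=True)) / x.std(1, keepdims=True)  # applyMeanVarNorm
        for W, b in zip(Ws[:-1], Bs[:-1]):
            h = 1 / (1 + np.exp(-(W @ h + b)))      # Sigmoid hidden layers -> H1, H2
        return Ws[-1] @ h + Bs[-1]                  # HLast; CrossEntropyWithSoftmax adds the softmax

    print(forward(rng.normal(size=(2, 5))).shape)   # (2, 5): 2 classes x 5 samples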
@@ -167,7 +167,7 @@ Multigpu_Demo_Train=[
SimpleNetworkBuilder = [
layerSizes = 2:50*2:2
trainingCriterion = "CrossEntropyWithSoftmax"
-evalCriterion = "ClassificationError"
+evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
initValueScale = 1.0
applyMeanVarNorm = true
@@ -300,7 +300,7 @@ configparameters: Multigpu.cntk:Multigpu_Demo_Train=[
SimpleNetworkBuilder = [
layerSizes = 2:50*2:2
trainingCriterion = "CrossEntropyWithSoftmax"
-evalCriterion = "ClassificationError"
+evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
initValueScale = 1.0
applyMeanVarNorm = true
@@ -369,7 +369,7 @@ Post-processing network...
7 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-EvalClassificationError = ClassificationError()
+EvalErrorPrediction = ErrorPrediction()
InvStdOfFeatures = InvStdDev()
MeanOfFeatures = Mean()
PosteriorProb = Softmax()
@@ -398,7 +398,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *] -> [2 x 1 x *]
Validating --> B2 = LearnableParameter() : -> [2 x 1]
Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *], [2 x 1] -> [2 x 1 x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *] -> [2 x 1 x *]
Validating --> Prior = Mean (labels) : [2 x *] -> [2]
Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@@ -422,14 +422,14 @@ Post-processing network complete.
05/03/2016 15:29:53: Evaluation criterion node(s):
-05/03/2016 15:29:53: 	EvalClassificationError = ClassificationError
+05/03/2016 15:29:53: 	EvalErrorPrediction = ErrorPrediction
Allocating matrices for forward and/or backward propagation.
Memory Sharing Structure:
-0000000000000000: {[EvalClassificationError Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
+0000000000000000: {[EvalErrorPrediction Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
000000572B66ECA0: {[features Value[2 x *]] }
00000057420A1700: {[W1 Value[50 x 50]] }
00000057420A1980: {[MeanOfFeatures Value[2]] }
@@ -448,7 +448,7 @@ Memory Sharing Structure:
00000057439283E0: {[LogOfPrior Value[2]] }
00000057439285C0: {[W0 Gradient[50 x 2]] [W0*features+B0 Value[50 x 1 x *]] }
0000005743928660: {[B1 Gradient[50 x 1]] [H2 Gradient[50 x 1 x *]] [HLast Gradient[2 x 1 x *]] }
-00000057439287A0: {[EvalClassificationError Value[1]] }
+00000057439287A0: {[EvalErrorPrediction Value[1]] }
0000005743928980: {[CrossEntropyWithSoftmax Value[1]] }
0000005743928A20: {[B2 Gradient[2 x 1]] }
0000005743928E80: {[H1 Value[50 x 1 x *]] [W0*features Gradient[50 x *]] }
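In these memory-sharing dumps each hexadecimal address is one actual buffer, and every matrix listed behind it reuses that buffer; the all-zero address collects matrices that are never materialized at all, which is why in a "test" action (no backpropagation) most Gradient matrices land there. Matrices can share a real buffer when their lifetimes in the evaluation order do not overlap, e.g. [B1 Gradient] with [H2 Gradient] and [HLast Gradient] above. A hypothetical greedy version of that interval-based reuse, as a sketch (not CNTK's actual allocator):

    def assign_buffers(lifetimes):
        # lifetimes: {name: (first_use, last_use)} in evaluation order
        free, assignment, next_id = [], {}, 0      # free: (usable_from, buffer_id)
        for name, (start, end) in sorted(lifetimes.items(), key=lambda kv: kv[1][0]):
            for i, (usable_from, buf) in enumerate(free):
                if usable_from <= start:           # previous tenant already finished
                    free.pop(i)
                    break
            else:
                buf, next_id = next_id, next_id + 1  # nothing reusable: new buffer
            assignment[name] = buf
            free.append((end + 1, buf))
        return assignment

    print(assign_buffers({"H1 Value": (0, 2), "H2 Value": (3, 5), "HLast Value": (4, 6)}))
    # {'H1 Value': 0, 'H2 Value': 0, 'HLast Value': 1} -- H1 and H2 share one buffer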
@@ -472,139 +472,139 @@ Memory Sharing Structure:
05/03/2016 15:29:54: Starting Epoch 1: learning rate per sample = 0.020000 effective momentum = 0.900000 momentum as time constant = 237.3 samples
05/03/2016 15:29:54: Starting minibatch loop.
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 1- 10]: CrossEntropyWithSoftmax = 0.70650452 * 250; EvalClassificationError = 0.55200000 * 250; time = 0.0115s; samplesPerSecond = 21832.2
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 11- 20]: CrossEntropyWithSoftmax = 0.69701831 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0095s; samplesPerSecond = 26326.9
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 21- 30]: CrossEntropyWithSoftmax = 0.71089587 * 250; EvalClassificationError = 0.50400000 * 250; time = 0.0100s; samplesPerSecond = 25067.7
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 31- 40]: CrossEntropyWithSoftmax = 0.72980273 * 250; EvalClassificationError = 0.56000000 * 250; time = 0.0096s; samplesPerSecond = 26079.7
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 41- 50]: CrossEntropyWithSoftmax = 0.70902783 * 250; EvalClassificationError = 0.52800000 * 250; time = 0.0115s; samplesPerSecond = 21692.0
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 51- 60]: CrossEntropyWithSoftmax = 0.72657300 * 250; EvalClassificationError = 0.54400000 * 250; time = 0.0124s; samplesPerSecond = 20127.2
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 61- 70]: CrossEntropyWithSoftmax = 0.69319678 * 250; EvalClassificationError = 0.43200000 * 250; time = 0.0091s; samplesPerSecond = 27439.4
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 71- 80]: CrossEntropyWithSoftmax = 0.73563477 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0112s; samplesPerSecond = 22246.0
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 81- 90]: CrossEntropyWithSoftmax = 0.71463281 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0115s; samplesPerSecond = 21739.1
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 91- 100]: CrossEntropyWithSoftmax = 0.75213428 * 250; EvalClassificationError = 0.47200000 * 250; time = 0.0105s; samplesPerSecond = 23814.1
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.75931445 * 250; EvalClassificationError = 0.53200000 * 250; time = 0.0115s; samplesPerSecond = 21763.7
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.73075293 * 250; EvalClassificationError = 0.50800000 * 250; time = 0.0120s; samplesPerSecond = 20835.1
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.76701953 * 250; EvalClassificationError = 0.53200000 * 250; time = 0.0130s; samplesPerSecond = 19305.0
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.70451270 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0108s; samplesPerSecond = 23184.6
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.70539941 * 250; EvalClassificationError = 0.50400000 * 250; time = 0.0117s; samplesPerSecond = 21385.8
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.72700293 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0120s; samplesPerSecond = 20917.0
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70096191 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0112s; samplesPerSecond = 22301.5
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.69437305 * 250; EvalClassificationError = 0.49600000 * 250; time = 0.0113s; samplesPerSecond = 22079.0
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.69161621 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0116s; samplesPerSecond = 21514.6
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.73388281 * 250; EvalClassificationError = 0.55200000 * 250; time = 0.0107s; samplesPerSecond = 23406.0
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.72255664 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0116s; samplesPerSecond = 21546.2
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70414551 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0115s; samplesPerSecond = 21756.2
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69976758 * 250; EvalClassificationError = 0.46000000 * 250; time = 0.0113s; samplesPerSecond = 22065.3
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.72419141 * 250; EvalClassificationError = 0.51600000 * 250; time = 0.0114s; samplesPerSecond = 22018.7
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.69943945 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0111s; samplesPerSecond = 22604.0
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.69206445 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0111s; samplesPerSecond = 22504.3
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.68771680 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0113s; samplesPerSecond = 22118.0
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.69878516 * 250; EvalClassificationError = 0.44000000 * 250; time = 0.0130s; samplesPerSecond = 19278.2
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.71889844 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0127s; samplesPerSecond = 19632.5
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.70086523 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0095s; samplesPerSecond = 26329.6
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.70878320 * 250; EvalClassificationError = 0.53200000 * 250; time = 0.0112s; samplesPerSecond = 22361.4
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.70674414 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0130s; samplesPerSecond = 19168.8
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.69707422 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0094s; samplesPerSecond = 26729.4
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.68588281 * 250; EvalClassificationError = 0.40800000 * 250; time = 0.0112s; samplesPerSecond = 22365.4
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.67734766 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0128s; samplesPerSecond = 19583.3
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.67958008 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0092s; samplesPerSecond = 27144.4
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.66424805 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0114s; samplesPerSecond = 21864.6
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.62412500 * 250; EvalClassificationError = 0.20400000 * 250; time = 0.0116s; samplesPerSecond = 21475.8
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.58007422 * 250; EvalClassificationError = 0.16000000 * 250; time = 0.0094s; samplesPerSecond = 26567.5
-05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.52764648 * 250; EvalClassificationError = 0.19200000 * 250; time = 0.0132s; samplesPerSecond = 18988.3
-05/03/2016 15:29:54: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.69975483 * 10000; EvalClassificationError = 0.46850000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=0.453807s
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 1- 10]: CrossEntropyWithSoftmax = 0.70650452 * 250; EvalErrorPrediction = 0.55200000 * 250; time = 0.0115s; samplesPerSecond = 21832.2
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 11- 20]: CrossEntropyWithSoftmax = 0.69701831 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0095s; samplesPerSecond = 26326.9
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 21- 30]: CrossEntropyWithSoftmax = 0.71089587 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.0100s; samplesPerSecond = 25067.7
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 31- 40]: CrossEntropyWithSoftmax = 0.72980273 * 250; EvalErrorPrediction = 0.56000000 * 250; time = 0.0096s; samplesPerSecond = 26079.7
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 41- 50]: CrossEntropyWithSoftmax = 0.70902783 * 250; EvalErrorPrediction = 0.52800000 * 250; time = 0.0115s; samplesPerSecond = 21692.0
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 51- 60]: CrossEntropyWithSoftmax = 0.72657300 * 250; EvalErrorPrediction = 0.54400000 * 250; time = 0.0124s; samplesPerSecond = 20127.2
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 61- 70]: CrossEntropyWithSoftmax = 0.69319678 * 250; EvalErrorPrediction = 0.43200000 * 250; time = 0.0091s; samplesPerSecond = 27439.4
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 71- 80]: CrossEntropyWithSoftmax = 0.73563477 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0112s; samplesPerSecond = 22246.0
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 81- 90]: CrossEntropyWithSoftmax = 0.71463281 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0115s; samplesPerSecond = 21739.1
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 91- 100]: CrossEntropyWithSoftmax = 0.75213428 * 250; EvalErrorPrediction = 0.47200000 * 250; time = 0.0105s; samplesPerSecond = 23814.1
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.75931445 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0115s; samplesPerSecond = 21763.7
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.73075293 * 250; EvalErrorPrediction = 0.50800000 * 250; time = 0.0120s; samplesPerSecond = 20835.1
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.76701953 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0130s; samplesPerSecond = 19305.0
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.70451270 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0108s; samplesPerSecond = 23184.6
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.70539941 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.0117s; samplesPerSecond = 21385.8
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.72700293 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0120s; samplesPerSecond = 20917.0
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70096191 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0112s; samplesPerSecond = 22301.5
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.69437305 * 250; EvalErrorPrediction = 0.49600000 * 250; time = 0.0113s; samplesPerSecond = 22079.0
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.69161621 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0116s; samplesPerSecond = 21514.6
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.73388281 * 250; EvalErrorPrediction = 0.55200000 * 250; time = 0.0107s; samplesPerSecond = 23406.0
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.72255664 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0116s; samplesPerSecond = 21546.2
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70414551 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0115s; samplesPerSecond = 21756.2
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69976758 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0113s; samplesPerSecond = 22065.3
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.72419141 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0114s; samplesPerSecond = 22018.7
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.69943945 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0111s; samplesPerSecond = 22604.0
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.69206445 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0111s; samplesPerSecond = 22504.3
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.68771680 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0113s; samplesPerSecond = 22118.0
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.69878516 * 250; EvalErrorPrediction = 0.44000000 * 250; time = 0.0130s; samplesPerSecond = 19278.2
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.71889844 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0127s; samplesPerSecond = 19632.5
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.70086523 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0095s; samplesPerSecond = 26329.6
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.70878320 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0112s; samplesPerSecond = 22361.4
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.70674414 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0130s; samplesPerSecond = 19168.8
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.69707422 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0094s; samplesPerSecond = 26729.4
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.68588281 * 250; EvalErrorPrediction = 0.40800000 * 250; time = 0.0112s; samplesPerSecond = 22365.4
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.67734766 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0128s; samplesPerSecond = 19583.3
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.67958008 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0092s; samplesPerSecond = 27144.4
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.66424805 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0114s; samplesPerSecond = 21864.6
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.62412500 * 250; EvalErrorPrediction = 0.20400000 * 250; time = 0.0116s; samplesPerSecond = 21475.8
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.58007422 * 250; EvalErrorPrediction = 0.16000000 * 250; time = 0.0094s; samplesPerSecond = 26567.5
+05/03/2016 15:29:54: Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.52764648 * 250; EvalErrorPrediction = 0.19200000 * 250; time = 0.0132s; samplesPerSecond = 18988.3
+05/03/2016 15:29:54: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.69975483 * 10000; EvalErrorPrediction = 0.46850000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=0.453807s
05/03/2016 15:29:54: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503162947.903093\CNTKTextFormatReader\Examples\Other\Simple2d_MultiGpu@release_gpu/Models/multigpu.dnn.1'
05/03/2016 15:29:54: Starting Epoch 2: learning rate per sample = 0.008000 effective momentum = 0.900000 momentum as time constant = 237.3 samples
05/03/2016 15:29:54: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1).
-05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.45075654 * 250; EvalClassificationError = 0.15200000 * 250; time = 0.0250s; samplesPerSecond = 10002.4
-05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.40775497 * 250; EvalClassificationError = 0.14400000 * 250; time = 0.0219s; samplesPerSecond = 11420.2
-05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.34165228 * 250; EvalClassificationError = 0.11200000 * 250; time = 0.0230s; samplesPerSecond = 10859.6
-05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.29708900 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0198s; samplesPerSecond = 12604.0
-05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.26669365 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0211s; samplesPerSecond = 11860.7
-05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.25328680 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0212s; samplesPerSecond = 11817.0
-05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.21017820 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0237s; samplesPerSecond = 10540.1
-05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.21483054 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0214s; samplesPerSecond = 11699.7
-05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.16626513 * 250; EvalClassificationError = 0.04000000 * 250; time = 0.0213s; samplesPerSecond = 11757.5
-05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.17672434 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0239s; samplesPerSecond = 10454.6
-05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.22140190 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0208s; samplesPerSecond = 12033.1
-05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.17048554 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0237s; samplesPerSecond = 10553.4
-05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.16438517 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0234s; samplesPerSecond = 10662.3
-05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13782141 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0218s; samplesPerSecond = 11449.0
-05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.16909663 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0244s; samplesPerSecond = 10228.7
-05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.15419129 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0229s; samplesPerSecond = 10924.7
-05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.22229924 * 250; EvalClassificationError = 0.11200000 * 250; time = 0.0242s; samplesPerSecond = 10340.4
-05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18134995 * 250; EvalClassificationError = 0.09600000 * 250; time = 0.0236s; samplesPerSecond = 10579.3
-05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.15616904 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0236s; samplesPerSecond = 10594.6
-05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.17162733 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0262s; samplesPerSecond = 9530.3
-05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.15676289 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0262s; samplesPerSecond = 9554.4
-05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.16159542 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0262s; samplesPerSecond = 9558.8
-05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.16102246 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0284s; samplesPerSecond = 8800.3
-05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.15392923 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0248s; samplesPerSecond = 10089.6
-05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.14898334 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0269s; samplesPerSecond = 9279.5
-05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.15087969 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0285s; samplesPerSecond = 8785.2
-05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.15494578 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0247s; samplesPerSecond = 10101.4
-05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.17878713 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0250s; samplesPerSecond = 9986.0
-05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.22845049 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0249s; samplesPerSecond = 10045.4
-05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16884430 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0241s; samplesPerSecond = 10376.5
-05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17970282 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0237s; samplesPerSecond = 10533.9
-05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.13292468 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0257s; samplesPerSecond = 9721.6
-05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.14167778 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0226s; samplesPerSecond = 11048.3
-05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.18716852 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0237s; samplesPerSecond = 10534.7
-05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.15480385 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0258s; samplesPerSecond = 9705.0
-05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.19482328 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0247s; samplesPerSecond = 10115.7
-05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.17488171 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0249s; samplesPerSecond = 10048.2
-05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.15164433 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0281s; samplesPerSecond = 8901.2
-05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.12142463 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0222s; samplesPerSecond = 11279.0
-05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15287631 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0238s; samplesPerSecond = 10489.7
-05/03/2016 15:29:55: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.19475469 * 10000; EvalClassificationError = 0.07830000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=0.964496s
+05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.45075654 * 250; EvalErrorPrediction = 0.15200000 * 250; time = 0.0250s; samplesPerSecond = 10002.4
+05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.40775497 * 250; EvalErrorPrediction = 0.14400000 * 250; time = 0.0219s; samplesPerSecond = 11420.2
+05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.34165228 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0230s; samplesPerSecond = 10859.6
+05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.29708900 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0198s; samplesPerSecond = 12604.0
+05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.26669365 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0211s; samplesPerSecond = 11860.7
+05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.25328680 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0212s; samplesPerSecond = 11817.0
+05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.21017820 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0237s; samplesPerSecond = 10540.1
+05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.21483054 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0214s; samplesPerSecond = 11699.7
+05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.16626513 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0213s; samplesPerSecond = 11757.5
+05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.17672434 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0239s; samplesPerSecond = 10454.6
+05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.22140190 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0208s; samplesPerSecond = 12033.1
+05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.17048554 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0237s; samplesPerSecond = 10553.4
+05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.16438517 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0234s; samplesPerSecond = 10662.3
+05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13782141 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0218s; samplesPerSecond = 11449.0
+05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.16909663 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0244s; samplesPerSecond = 10228.7
+05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.15419129 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0229s; samplesPerSecond = 10924.7
+05/03/2016 15:29:54: Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.22229924 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0242s; samplesPerSecond = 10340.4
+05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18134995 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0236s; samplesPerSecond = 10579.3
+05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.15616904 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0236s; samplesPerSecond = 10594.6
+05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.17162733 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0262s; samplesPerSecond = 9530.3
+05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.15676289 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0262s; samplesPerSecond = 9554.4
+05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.16159542 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0262s; samplesPerSecond = 9558.8
+05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.16102246 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0284s; samplesPerSecond = 8800.3
+05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.15392923 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0248s; samplesPerSecond = 10089.6
+05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.14898334 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0269s; samplesPerSecond = 9279.5
+05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.15087969 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0285s; samplesPerSecond = 8785.2
+05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.15494578 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0247s; samplesPerSecond = 10101.4
+05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.17878713 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0250s; samplesPerSecond = 9986.0
+05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.22845049 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0249s; samplesPerSecond = 10045.4
+05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16884430 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0241s; samplesPerSecond = 10376.5
+05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17970282 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0237s; samplesPerSecond = 10533.9
+05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.13292468 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0257s; samplesPerSecond = 9721.6
+05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.14167778 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0226s; samplesPerSecond = 11048.3
+05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.18716852 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0237s; samplesPerSecond = 10534.7
+05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.15480385 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0258s; samplesPerSecond = 9705.0
+05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.19482328 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0247s; samplesPerSecond = 10115.7
+05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.17488171 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0249s; samplesPerSecond = 10048.2
+05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.15164433 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0281s; samplesPerSecond = 8901.2
+05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.12142463 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0222s; samplesPerSecond = 11279.0
+05/03/2016 15:29:55: Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15287631 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0238s; samplesPerSecond = 10489.7
+05/03/2016 15:29:55: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.19475469 * 10000; EvalErrorPrediction = 0.07830000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=0.964496s
05/03/2016 15:29:55: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503162947.903093\CNTKTextFormatReader\Examples\Other\Simple2d_MultiGpu@release_gpu/Models/multigpu.dnn.2' 05/03/2016 15:29:55: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503162947.903093\CNTKTextFormatReader\Examples\Other\Simple2d_MultiGpu@release_gpu/Models/multigpu.dnn.2'
05/03/2016 15:29:55: Starting Epoch 3: learning rate per sample = 0.008000 effective momentum = 0.900000 momentum as time constant = 237.3 samples 05/03/2016 15:29:55: Starting Epoch 3: learning rate per sample = 0.008000 effective momentum = 0.900000 momentum as time constant = 237.3 samples
05/03/2016 15:29:55: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1). 05/03/2016 15:29:55: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1).
05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.10717578 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0253s; samplesPerSecond = 9869.7 05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.10717578 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0253s; samplesPerSecond = 9869.7
05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.17521929 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0234s; samplesPerSecond = 10701.1 05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.17521929 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0234s; samplesPerSecond = 10701.1
05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.14088211 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0250s; samplesPerSecond = 9986.8 05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.14088211 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0250s; samplesPerSecond = 9986.8
05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.16281337 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0243s; samplesPerSecond = 10287.6 05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.16281337 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0243s; samplesPerSecond = 10287.6
05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.11778386 * 250; EvalClassificationError = 0.04000000 * 250; time = 0.0234s; samplesPerSecond = 10666.9 05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.11778386 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0234s; samplesPerSecond = 10666.9
05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.16295400 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0266s; samplesPerSecond = 9385.8 05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.16295400 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0266s; samplesPerSecond = 9385.8
05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.16287201 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0233s; samplesPerSecond = 10746.2 05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.16287201 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0233s; samplesPerSecond = 10746.2
05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.19482140 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0242s; samplesPerSecond = 10312.3 05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.19482140 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0242s; samplesPerSecond = 10312.3
05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.20113689 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0235s; samplesPerSecond = 10643.3 05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.20113689 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0235s; samplesPerSecond = 10643.3
05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.13748570 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0238s; samplesPerSecond = 10484.4 05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.13748570 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0238s; samplesPerSecond = 10484.4
05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.20080420 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0236s; samplesPerSecond = 10600.9 05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.20080420 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0236s; samplesPerSecond = 10600.9
05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.17730590 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0268s; samplesPerSecond = 9342.3 05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.17730590 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0268s; samplesPerSecond = 9342.3
05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.15851029 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0233s; samplesPerSecond = 10743.0 05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.15851029 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0233s; samplesPerSecond = 10743.0
05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16257260 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0250s; samplesPerSecond = 10012.8 05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16257260 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0250s; samplesPerSecond = 10012.8
05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.19772537 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0224s; samplesPerSecond = 11143.3 05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.19772537 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0224s; samplesPerSecond = 11143.3
05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.10259204 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0235s; samplesPerSecond = 10626.1 05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.10259204 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0235s; samplesPerSecond = 10626.1
05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17093073 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0244s; samplesPerSecond = 10230.0 05/03/2016 15:29:55: Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17093073 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0244s; samplesPerSecond = 10230.0
05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16628544 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0252s; samplesPerSecond = 9936.8 05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16628544 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0252s; samplesPerSecond = 9936.8
05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.12690716 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0246s; samplesPerSecond = 10171.7 05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.12690716 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0246s; samplesPerSecond = 10171.7
05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.11894288 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0233s; samplesPerSecond = 10718.1 05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.11894288 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0233s; samplesPerSecond = 10718.1
05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.12815907 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0246s; samplesPerSecond = 10151.0 05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.12815907 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0246s; samplesPerSecond = 10151.0
05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.18265773 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0225s; samplesPerSecond = 11131.9 05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.18265773 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0225s; samplesPerSecond = 11131.9
05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.13388730 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0231s; samplesPerSecond = 10807.5 05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.13388730 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0231s; samplesPerSecond = 10807.5
05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.19787903 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0251s; samplesPerSecond = 9951.4 05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.19787903 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0251s; samplesPerSecond = 9951.4
05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.15563315 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0241s; samplesPerSecond = 10373.0 05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.15563315 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0241s; samplesPerSecond = 10373.0
05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.11837055 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0240s; samplesPerSecond = 10429.3 05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.11837055 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0240s; samplesPerSecond = 10429.3
05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.13732942 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0234s; samplesPerSecond = 10689.7 05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.13732942 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0234s; samplesPerSecond = 10689.7
05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.20012115 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0253s; samplesPerSecond = 9872.4 05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.20012115 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0253s; samplesPerSecond = 9872.4
05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.19086846 * 250; EvalClassificationError = 0.11200000 * 250; time = 0.0238s; samplesPerSecond = 10525.4 05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.19086846 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0238s; samplesPerSecond = 10525.4
05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16492589 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0243s; samplesPerSecond = 10272.8 05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16492589 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0243s; samplesPerSecond = 10272.8
05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.12141157 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0238s; samplesPerSecond = 10509.5 05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.12141157 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0238s; samplesPerSecond = 10509.5
05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.16335481 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0236s; samplesPerSecond = 10579.3 05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.16335481 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0236s; samplesPerSecond = 10579.3
05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.15923900 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0241s; samplesPerSecond = 10358.0 05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.15923900 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0241s; samplesPerSecond = 10358.0
05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.12315803 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0235s; samplesPerSecond = 10617.1 05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.12315803 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0235s; samplesPerSecond = 10617.1
05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13481532 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0260s; samplesPerSecond = 9612.4 05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13481532 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0260s; samplesPerSecond = 9612.4
05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.20958008 * 250; EvalClassificationError = 0.11600000 * 250; time = 0.0223s; samplesPerSecond = 11232.4 05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.20958008 * 250; EvalErrorPrediction = 0.11600000 * 250; time = 0.0223s; samplesPerSecond = 11232.4
05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16519713 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0255s; samplesPerSecond = 9814.3 05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16519713 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0255s; samplesPerSecond = 9814.3
05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14990733 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0239s; samplesPerSecond = 10481.3 05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14990733 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0239s; samplesPerSecond = 10481.3
05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.16508552 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0255s; samplesPerSecond = 9789.3 05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.16508552 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0255s; samplesPerSecond = 9789.3
05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.16941540 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0240s; samplesPerSecond = 10435.4 05/03/2016 15:29:56: Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.16941540 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0240s; samplesPerSecond = 10435.4
05/03/2016 15:29:56: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.15791792 * 10000; EvalClassificationError = 0.07460000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=0.970059s 05/03/2016 15:29:56: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.15791792 * 10000; EvalErrorPrediction = 0.07460000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=0.970059s
05/03/2016 15:29:56: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503162947.903093\CNTKTextFormatReader\Examples\Other\Simple2d_MultiGpu@release_gpu/Models/multigpu.dnn' 05/03/2016 15:29:56: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503162947.903093\CNTKTextFormatReader\Examples\Other\Simple2d_MultiGpu@release_gpu/Models/multigpu.dnn'
05/03/2016 15:29:56: CNTKCommandTrainEnd: Multigpu_Demo_Train 05/03/2016 15:29:56: CNTKCommandTrainEnd: Multigpu_Demo_Train
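Editor's note: "NumGradientBits = 1" in the minibatch-loop line above refers to CNTK's 1-bit SGD for data-parallel training: each gradient value is quantized to a single bit before it is exchanged between nodes, and the quantization error is remembered and folded into the next minibatch's gradient. A minimal sketch of that quantize-with-error-feedback step (an illustrative NumPy stand-in with simplified reconstruction, not CNTK's actual implementation):

    import numpy as np

    def one_bit_quantize(grad, residual):
        # Fold in the quantization error carried over from the last minibatch.
        g = grad + residual
        bits = g >= 0                       # the single transmitted bit per value
        # Reconstruct each side with its mean so the quantizer stays unbiased.
        pos = g[bits].mean() if bits.any() else 0.0
        neg = g[~bits].mean() if (~bits).any() else 0.0
        recon = np.where(bits, pos, neg)
        return bits, (neg, pos), g - recon  # last item: residual for the next step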
@@ -622,7 +622,7 @@ Post-processing network...
 7 roots:
 CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-EvalClassificationError = ClassificationError()
+EvalErrorPrediction = ErrorPrediction()
 InvStdOfFeatures = InvStdDev()
 MeanOfFeatures = Mean()
 PosteriorProb = Softmax()
@@ -651,7 +651,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *1] -> [2 x 1 x *1]
 Validating --> B2 = LearnableParameter() : -> [2 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *1], [2 x 1] -> [2 x 1 x *1]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *1] -> [2 x 1 x *1]
 Validating --> Prior = Mean (labels) : [2 x *1] -> [2]
 Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
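Editor's note on the validation lines above: shapes like [2 x 1 x *1] are static dimensions plus a dynamic axis (*1) whose length varies with the number of samples, and both criterion nodes reduce to a scalar [1]. The renamed EvalErrorPrediction = ErrorPrediction (labels, HLast) is simply the classification error rate over that dynamic axis; a sketch of the equivalent computation (hypothetical NumPy stand-in, assuming one-hot labels, not CNTK code):

    import numpy as np

    def error_prediction(labels, hlast):
        # labels, hlast: [num_classes, num_samples]; compare top-scoring classes.
        return float(np.mean(hlast.argmax(axis=0) != labels.argmax(axis=0)))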
@@ -675,7 +675,7 @@ Allocating matrices for forward and/or backward propagation.
 Memory Sharing Structure:
-0000000000000000: {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalClassificationError Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
+0000000000000000: {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalErrorPrediction Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
 0000005743925BB0: {[HLast Value[2 x 1 x *1]] }
 0000005743925D90: {[MVNormalizedFeatures Value[2 x *1]] }
 0000005743925E30: {[CrossEntropyWithSoftmax Value[1]] }
@@ -688,7 +688,7 @@ Memory Sharing Structure:
 00000057439265B0: {[W0*features+B0 Value[50 x 1 x *1]] }
 0000005743926650: {[W1*H1 Value[50 x 1 x *1]] }
 0000005743926970: {[H2 Value[50 x 1 x *1]] }
-0000005743926AB0: {[EvalClassificationError Value[1]] }
+0000005743926AB0: {[EvalErrorPrediction Value[1]] }
 000000574B7FAD10: {[W0 Value[50 x 2]] }
 000000574B7FB170: {[InvStdOfFeatures Value[2]] }
 000000574B7FB210: {[MeanOfFeatures Value[2]] }
@@ -700,7 +700,7 @@ Memory Sharing Structure:
 000000574D960E50: {[B2 Value[2 x 1]] }
 000000574D9610D0: {[B0 Value[50 x 1]] }
-05/03/2016 15:29:56: Final Results: Minibatch[1-1]: EvalClassificationError = 0.05638474 * 603; CrossEntropyWithSoftmax = 0.12022919 * 603; perplexity = 1.12775529
+05/03/2016 15:29:56: Final Results: Minibatch[1-1]: EvalErrorPrediction = 0.05638474 * 603; CrossEntropyWithSoftmax = 0.12022919 * 603; perplexity = 1.12775529
 05/03/2016 15:29:56: Action "test" complete.
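Editor's note: the final test line reports per-sample averages scaled by the 603-sample test set, and perplexity is the exponential of the per-sample cross-entropy. A quick check of the reported numbers (the exp relationship is the standard definition; it reproduces the log to the printed precision):

    import math

    print(math.exp(0.12022919))   # 1.127755..., the reported perplexity
    print(0.05638474 * 603)       # ~34.0: misclassified samples behind the error rate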

View file

@@ -58,7 +58,7 @@ Simple_Demo_Train = [
 SimpleNetworkBuilder = [
 layerSizes = 2:50*2:2
 trainingCriterion = "CrossEntropyWithSoftmax"
-evalCriterion = "ClassificationError"
+evalCriterion = "ErrorPrediction"
 layerTypes = "Sigmoid"
 initValueScale = 1.0
 applyMeanVarNorm = true
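Editor's note: the unchanged lines in this hunk pin down the model. layerSizes = 2:50*2:2 is SimpleNetworkBuilder shorthand for 2 inputs, two Sigmoid hidden layers of 50 units each, and 2 outputs, with mean/variance normalization applied to the input; only evalCriterion is renamed. A minimal forward-pass sketch under those assumptions (illustrative only; random weights stand in for the learned W0/B0, W1/B1, W2/B2 seen in the validation output):

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    dims = [2, 50, 50, 2]                       # layerSizes = 2:50*2:2
    rng = np.random.default_rng(0)
    Ws = [rng.standard_normal((o, i)) for i, o in zip(dims, dims[1:])]
    bs = [np.zeros((o, 1)) for o in dims[1:]]

    def forward(features, mean, inv_std):
        h = (features - mean) * inv_std         # applyMeanVarNorm = true
        h = sigmoid(Ws[0] @ h + bs[0])          # H1 = Sigmoid(W0*features + B0)
        h = sigmoid(Ws[1] @ h + bs[1])          # H2 = Sigmoid(W1*H1 + B1)
        return Ws[2] @ h + bs[2]                # HLast, fed to both criterion nodes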
@@ -157,7 +157,7 @@ Simple_Demo_Train = [
 SimpleNetworkBuilder = [
 layerSizes = 2:50*2:2
 trainingCriterion = "CrossEntropyWithSoftmax"
-evalCriterion = "ClassificationError"
+evalCriterion = "ErrorPrediction"
 layerTypes = "Sigmoid"
 initValueScale = 1.0
 applyMeanVarNorm = true
@@ -300,7 +300,7 @@ configparameters: Simple.cntk:Simple_Demo_Train=[
 SimpleNetworkBuilder = [
 layerSizes = 2:50*2:2
 trainingCriterion = "CrossEntropyWithSoftmax"
-evalCriterion = "ClassificationError"
+evalCriterion = "ErrorPrediction"
 layerTypes = "Sigmoid"
 initValueScale = 1.0
 applyMeanVarNorm = true
@@ -355,7 +355,7 @@ Post-processing network...
 7 roots:
 CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-EvalClassificationError = ClassificationError()
+EvalErrorPrediction = ErrorPrediction()
 InvStdOfFeatures = InvStdDev()
 MeanOfFeatures = Mean()
 PosteriorProb = Softmax()
@@ -384,7 +384,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *] -> [2 x 1 x *]
 Validating --> B2 = LearnableParameter() : -> [2 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *], [2 x 1] -> [2 x 1 x *]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *] -> [2 x 1 x *]
 Validating --> Prior = Mean (labels) : [2 x *] -> [2]
 Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@@ -408,14 +408,14 @@ Post-processing network complete.
 05/03/2016 15:21:15: Evaluation criterion node(s):
-05/03/2016 15:21:15: EvalClassificationError = ClassificationError
+05/03/2016 15:21:15: EvalErrorPrediction = ErrorPrediction
 Allocating matrices for forward and/or backward propagation.
 Memory Sharing Structure:
-(nil): {[EvalClassificationError Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
+(nil): {[EvalErrorPrediction Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
 0x2e7f338: {[features Value[2 x *]] }
 0x2e82908: {[MeanOfFeatures Value[2]] }
 0x2e84f08: {[InvStdOfFeatures Value[2]] }
@@ -427,7 +427,7 @@ Memory Sharing Structure:
 0x2e8b718: {[B2 Value[2 x 1]] }
 0x2e8c1e8: {[labels Value[2 x *]] }
 0x2e8cf38: {[Prior Value[2]] }
-0x2e926f8: {[EvalClassificationError Value[1]] }
+0x2e926f8: {[EvalErrorPrediction Value[1]] }
 0x2e92858: {[ScaledLogLikelihood Value[2 x 1 x *]] }
 0x2e929b8: {[CrossEntropyWithSoftmax Value[1]] }
 0x2e93218: {[LogOfPrior Value[2]] }
@@ -458,139 +458,139 @@ Memory Sharing Structure:
 05/03/2016 15:21:17: Starting Epoch 1: learning rate per sample = 0.020000 effective momentum = 0.900000 momentum as time constant = 237.3 samples
 05/03/2016 15:21:17: Starting minibatch loop.
-05/03/2016 15:21:17: Epoch[ 1 of 3]-Minibatch[   1-  10]: CrossEntropyWithSoftmax = 0.69966235 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0806s; samplesPerSecond = 3103.4
+05/03/2016 15:21:17: Epoch[ 1 of 3]-Minibatch[   1-  10]: CrossEntropyWithSoftmax = 0.69966235 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0806s; samplesPerSecond = 3103.4
-05/03/2016 15:21:17: Epoch[ 1 of 3]-Minibatch[  11-  20]: CrossEntropyWithSoftmax = 0.70639648 * 250; EvalClassificationError = 0.49600000 * 250; time = 0.0489s; samplesPerSecond = 5107.5
+05/03/2016 15:21:17: Epoch[ 1 of 3]-Minibatch[  11-  20]: CrossEntropyWithSoftmax = 0.70639648 * 250; EvalErrorPrediction = 0.49600000 * 250; time = 0.0489s; samplesPerSecond = 5107.5
-05/03/2016 15:21:17: Epoch[ 1 of 3]-Minibatch[  21-  30]: CrossEntropyWithSoftmax = 0.70470264 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0598s; samplesPerSecond = 4180.0
+05/03/2016 15:21:17: Epoch[ 1 of 3]-Minibatch[  21-  30]: CrossEntropyWithSoftmax = 0.70470264 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0598s; samplesPerSecond = 4180.0
-05/03/2016 15:21:17: Epoch[ 1 of 3]-Minibatch[  31-  40]: CrossEntropyWithSoftmax = 0.69813501 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0581s; samplesPerSecond = 4306.3
+05/03/2016 15:21:17: Epoch[ 1 of 3]-Minibatch[  31-  40]: CrossEntropyWithSoftmax = 0.69813501 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0581s; samplesPerSecond = 4306.3
-05/03/2016 15:21:17: Epoch[ 1 of 3]-Minibatch[  41-  50]: CrossEntropyWithSoftmax = 0.73551416 * 250; EvalClassificationError = 0.57600000 * 250; time = 0.0618s; samplesPerSecond = 4045.4
+05/03/2016 15:21:17: Epoch[ 1 of 3]-Minibatch[  41-  50]: CrossEntropyWithSoftmax = 0.73551416 * 250; EvalErrorPrediction = 0.57600000 * 250; time = 0.0618s; samplesPerSecond = 4045.4
-05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[  51-  60]: CrossEntropyWithSoftmax = 0.72432324 * 250; EvalClassificationError = 0.50800000 * 250; time = 0.0579s; samplesPerSecond = 4314.7
+05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[  51-  60]: CrossEntropyWithSoftmax = 0.72432324 * 250; EvalErrorPrediction = 0.50800000 * 250; time = 0.0579s; samplesPerSecond = 4314.7
-05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[  61-  70]: CrossEntropyWithSoftmax = 0.73327588 * 250; EvalClassificationError = 0.48800000 * 250; time = 0.2699s; samplesPerSecond = 926.3
+05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[  61-  70]: CrossEntropyWithSoftmax = 0.73327588 * 250; EvalErrorPrediction = 0.48800000 * 250; time = 0.2699s; samplesPerSecond = 926.3
-05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[  71-  80]: CrossEntropyWithSoftmax = 0.70092627 * 250; EvalClassificationError = 0.50400000 * 250; time = 0.0620s; samplesPerSecond = 4035.0
+05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[  71-  80]: CrossEntropyWithSoftmax = 0.70092627 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.0620s; samplesPerSecond = 4035.0
-05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[  81-  90]: CrossEntropyWithSoftmax = 0.72354980 * 250; EvalClassificationError = 0.46000000 * 250; time = 0.0826s; samplesPerSecond = 3027.2
+05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[  81-  90]: CrossEntropyWithSoftmax = 0.72354980 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0826s; samplesPerSecond = 3027.2
-05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[  91- 100]: CrossEntropyWithSoftmax = 0.72148096 * 250; EvalClassificationError = 0.52000000 * 250; time = 0.0811s; samplesPerSecond = 3082.2
+05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[  91- 100]: CrossEntropyWithSoftmax = 0.72148096 * 250; EvalErrorPrediction = 0.52000000 * 250; time = 0.0811s; samplesPerSecond = 3082.2
-05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.69814941 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0895s; samplesPerSecond = 2793.1
+05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.69814941 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0895s; samplesPerSecond = 2793.1
-05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.70699121 * 250; EvalClassificationError = 0.54800000 * 250; time = 0.0482s; samplesPerSecond = 5187.9
+05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.70699121 * 250; EvalErrorPrediction = 0.54800000 * 250; time = 0.0482s; samplesPerSecond = 5187.9
-05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.69898437 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0567s; samplesPerSecond = 4408.3
+05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.69898437 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0567s; samplesPerSecond = 4408.3
-05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.71712695 * 250; EvalClassificationError = 0.54000000 * 250; time = 0.0586s; samplesPerSecond = 4266.7
+05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.71712695 * 250; EvalErrorPrediction = 0.54000000 * 250; time = 0.0586s; samplesPerSecond = 4266.7
-05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.69470703 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0546s; samplesPerSecond = 4575.3
+05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.69470703 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0546s; samplesPerSecond = 4575.3
-05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.71375879 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0640s; samplesPerSecond = 3907.4
+05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.71375879 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0640s; samplesPerSecond = 3907.4
-05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70381641 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0756s; samplesPerSecond = 3307.9
+05/03/2016 15:21:18: Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70381641 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0756s; samplesPerSecond = 3307.9
-05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.71748633 * 250; EvalClassificationError = 0.48800000 * 250; time = 0.0598s; samplesPerSecond = 4178.1
+05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.71748633 * 250; EvalErrorPrediction = 0.48800000 * 250; time = 0.0598s; samplesPerSecond = 4178.1
-05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.71863281 * 250; EvalClassificationError = 0.50400000 * 250; time = 0.0813s; samplesPerSecond = 3075.3
+05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.71863281 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.0813s; samplesPerSecond = 3075.3
-05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.70715234 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0811s; samplesPerSecond = 3082.9
+05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.70715234 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0811s; samplesPerSecond = 3082.9
-05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.70401074 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0673s; samplesPerSecond = 3717.1
+05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.70401074 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0673s; samplesPerSecond = 3717.1
-05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70599414 * 250; EvalClassificationError = 0.48400000 * 250; time = 0.0819s; samplesPerSecond = 3052.5
+05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70599414 * 250; EvalErrorPrediction = 0.48400000 * 250; time = 0.0819s; samplesPerSecond = 3052.5
-05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69628711 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0909s; samplesPerSecond = 2749.3
+05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69628711 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0909s; samplesPerSecond = 2749.3
-05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.75920898 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0752s; samplesPerSecond = 3323.1
+05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.75920898 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0752s; samplesPerSecond = 3323.1
-05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.70542578 * 250; EvalClassificationError = 0.43600000 * 250; time = 0.0734s; samplesPerSecond = 3406.2
+05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.70542578 * 250; EvalErrorPrediction = 0.43600000 * 250; time = 0.0734s; samplesPerSecond = 3406.2
-05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.70643945 * 250; EvalClassificationError = 0.46400000 * 250; time = 0.0869s; samplesPerSecond = 2875.4
+05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.70643945 * 250; EvalErrorPrediction = 0.46400000 * 250; time = 0.0869s; samplesPerSecond = 2875.4
-05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.72481641 * 250; EvalClassificationError = 0.51600000 * 250; time = 0.0893s; samplesPerSecond = 2798.7
+05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.72481641 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0893s; samplesPerSecond = 2798.7
-05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.71133594 * 250; EvalClassificationError = 0.55600000 * 250; time = 0.0814s; samplesPerSecond = 3072.2
+05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.71133594 * 250; EvalErrorPrediction = 0.55600000 * 250; time = 0.0814s; samplesPerSecond = 3072.2
-05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.68605664 * 250; EvalClassificationError = 0.47200000 * 250; time = 0.0812s; samplesPerSecond = 3077.4
+05/03/2016 15:21:19: Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.68605664 * 250; EvalErrorPrediction = 0.47200000 * 250; time = 0.0812s; samplesPerSecond = 3077.4
-05/03/2016 15:21:20: Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.69535352 * 250; EvalClassificationError = 0.47200000 * 250; time = 0.0895s; samplesPerSecond = 2792.1
+05/03/2016 15:21:20: Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.69535352 * 250; EvalErrorPrediction = 0.47200000 * 250; time = 0.0895s; samplesPerSecond = 2792.1
-05/03/2016 15:21:20: Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.68741797 * 250; EvalClassificationError = 0.45200000 * 250; time = 0.0831s; samplesPerSecond = 3008.7
+05/03/2016 15:21:20: Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.68741797 * 250; EvalErrorPrediction = 0.45200000 * 250; time = 0.0831s; samplesPerSecond = 3008.7
-05/03/2016 15:21:20: Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.67916406 * 250; EvalClassificationError = 0.46000000 * 250; time = 0.0818s; samplesPerSecond = 3056.5
+05/03/2016 15:21:20: Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.67916406 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0818s; samplesPerSecond = 3056.5
-05/03/2016 15:21:20: Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.67841992 * 250; EvalClassificationError = 0.44800000 * 250; time = 0.2681s; samplesPerSecond = 932.5
+05/03/2016 15:21:20: Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.67841992 * 250; EvalErrorPrediction = 0.44800000 * 250; time = 0.2681s; samplesPerSecond = 932.5
-05/03/2016 15:21:20: Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.68038477 * 250; EvalClassificationError = 0.49200000 * 250; time = 0.0513s; samplesPerSecond = 4869.4
+05/03/2016 15:21:20: Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.68038477 * 250; EvalErrorPrediction = 0.49200000 * 250; time = 0.0513s; samplesPerSecond = 4869.4
-05/03/2016 15:21:20: Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.61937109 * 250; EvalClassificationError = 0.30400000 * 250; time = 0.0680s; samplesPerSecond = 3678.3
+05/03/2016 15:21:20: Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.61937109 * 250; EvalErrorPrediction = 0.30400000 * 250; time = 0.0680s; samplesPerSecond = 3678.3
-05/03/2016 15:21:20: Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.57844141 * 250; EvalClassificationError = 0.27200000 * 250; time = 0.0758s; samplesPerSecond = 3296.3
+05/03/2016 15:21:20: Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.57844141 * 250; EvalErrorPrediction = 0.27200000 * 250; time = 0.0758s; samplesPerSecond = 3296.3
-05/03/2016 15:21:20: Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.49124023 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0664s; samplesPerSecond = 3763.4
+05/03/2016 15:21:20: Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.49124023 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0664s; samplesPerSecond = 3763.4
-05/03/2016 15:21:20: Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.39071289 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0505s; samplesPerSecond = 4955.3
+05/03/2016 15:21:20: Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.39071289 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0505s; samplesPerSecond = 4955.3
-05/03/2016 15:21:20: Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.27650586 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0515s; samplesPerSecond = 4855.7
+05/03/2016 15:21:20: Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.27650586 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0515s; samplesPerSecond = 4855.7
-05/03/2016 15:21:20: Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.26430078 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0517s; samplesPerSecond = 4834.4
+05/03/2016 15:21:20: Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.26430078 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0517s; samplesPerSecond = 4834.4
-05/03/2016 15:21:20: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.66664150 * 10000; EvalClassificationError = 0.44430000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=3.21314s
+05/03/2016 15:21:20: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.66664150 * 10000; EvalErrorPrediction = 0.44430000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=3.21314s
 05/03/2016 15:21:20: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152115.267374/CNTKTextFormatReader/Examples/Other/Simple2d_Simple@release_cpu/Models/simple.dnn.1'
 05/03/2016 15:21:20: Starting Epoch 2: learning rate per sample = 0.008000 effective momentum = 0.900000 momentum as time constant = 237.3 samples
 05/03/2016 15:21:20: Starting minibatch loop.
-05/03/2016 15:21:20: Epoch[ 2 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.20732678 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0782s; samplesPerSecond = 3196.0
+05/03/2016 15:21:20: Epoch[ 2 of 3]-Minibatch[   1-  10, 2.50%]: CrossEntropyWithSoftmax = 0.20732678 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0782s; samplesPerSecond = 3196.0
-05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.19684015 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0812s; samplesPerSecond = 3079.4
+05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[  11-  20, 5.00%]: CrossEntropyWithSoftmax = 0.19684015 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0812s; samplesPerSecond = 3079.4
-05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.16083588 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0796s; samplesPerSecond = 3141.3
+05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[  21-  30, 7.50%]: CrossEntropyWithSoftmax = 0.16083588 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0796s; samplesPerSecond = 3141.3
-05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.13558752 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0811s; samplesPerSecond = 3083.5
+05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[  31-  40, 10.00%]: CrossEntropyWithSoftmax = 0.13558752 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0811s; samplesPerSecond = 3083.5
-05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.17992950 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0814s; samplesPerSecond = 3070.9
+05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[  41-  50, 12.50%]: CrossEntropyWithSoftmax = 0.17992950 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0814s; samplesPerSecond = 3070.9
-05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.17858063 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0812s; samplesPerSecond = 3079.3
+05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[  51-  60, 15.00%]: CrossEntropyWithSoftmax = 0.17858063 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0812s; samplesPerSecond = 3079.3
-05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.16847546 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0688s; samplesPerSecond = 3631.6
+05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[  61-  70, 17.50%]: CrossEntropyWithSoftmax = 0.16847546 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0688s; samplesPerSecond = 3631.6
-05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.16359399 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0547s; samplesPerSecond = 4572.7
+05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[  71-  80, 20.00%]: CrossEntropyWithSoftmax = 0.16359399 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0547s; samplesPerSecond = 4572.7
-05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.19534705 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0521s; samplesPerSecond = 4796.2
+05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[  81-  90, 22.50%]: CrossEntropyWithSoftmax = 0.19534705 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0521s; samplesPerSecond = 4796.2
-05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.19363660 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0758s; samplesPerSecond = 3297.5
+05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[  91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.19363660 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0758s; samplesPerSecond = 3297.5
-05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.12703638 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0682s; samplesPerSecond = 3667.7
+05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.12703638 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0682s; samplesPerSecond = 3667.7
-05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.18622827 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0576s; samplesPerSecond = 4344.0
+05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.18622827 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0576s; samplesPerSecond = 4344.0
-05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.11595044 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0599s; samplesPerSecond = 4171.2
+05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.11595044 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0599s; samplesPerSecond = 4171.2
-05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16689380 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0650s; samplesPerSecond = 3845.2
+05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16689380 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0650s; samplesPerSecond = 3845.2
-05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.15822559 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0631s; samplesPerSecond = 3964.2
+05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.15822559 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0631s; samplesPerSecond = 3964.2
-05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.18381909 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0638s; samplesPerSecond = 3920.5
+05/03/2016 15:21:21: Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.18381909 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0638s; samplesPerSecond = 3920.5
-05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.18274048 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0642s; samplesPerSecond = 3893.2
+05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.18274048 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0642s; samplesPerSecond = 3893.2
-05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18638428 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0564s; samplesPerSecond = 4431.5
+05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18638428 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0564s; samplesPerSecond = 4431.5
-05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.20111572 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0528s; samplesPerSecond = 4733.8
+05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.20111572 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0528s; samplesPerSecond = 4733.8
-05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.13185034 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0504s; samplesPerSecond = 4962.1
+05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.13185034 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0504s; samplesPerSecond = 4962.1
-05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13692554 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0559s; samplesPerSecond = 4468.8
+05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13692554 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0559s; samplesPerSecond = 4468.8
-05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.15396802 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0672s; samplesPerSecond = 3719.4
+05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.15396802 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0672s; samplesPerSecond = 3719.4
-05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.15347241 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0818s; samplesPerSecond = 3057.6
+05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.15347241 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0818s; samplesPerSecond = 3057.6
-05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.14583887 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.2662s; samplesPerSecond = 939.1
+05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.14583887 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.2662s; samplesPerSecond = 939.1
-05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.12333276 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0738s; samplesPerSecond = 3389.0
+05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.12333276 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0738s; samplesPerSecond = 3389.0
-05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.13958154 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0778s; samplesPerSecond = 3211.3
+05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.13958154 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0778s; samplesPerSecond = 3211.3
-05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12539844 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0772s; samplesPerSecond = 3239.1
+05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12539844 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0772s; samplesPerSecond = 3239.1
-05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.19014404 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0475s; samplesPerSecond = 5259.1
+05/03/2016 15:21:22: Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.19014404 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0475s; samplesPerSecond = 5259.1
-05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.17959521 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0780s; samplesPerSecond = 3206.4
+05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.17959521 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0780s; samplesPerSecond = 3206.4
-05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.18899121 * 250; EvalClassificationError = 0.09600000 * 250; time = 0.0469s; samplesPerSecond = 5333.6
+05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.18899121 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0469s; samplesPerSecond = 5333.6
-05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17525586 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0625s; samplesPerSecond = 4003.1
+05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17525586 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0625s; samplesPerSecond = 4003.1
-05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.14735645 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0940s; samplesPerSecond = 2658.9
+05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.14735645 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0940s; samplesPerSecond = 2658.9
-05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.13705518 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0543s; samplesPerSecond = 4600.2
+05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.13705518 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0543s; samplesPerSecond = 4600.2
-05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.13610693 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0752s; samplesPerSecond = 3324.2
+05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.13610693 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0752s; samplesPerSecond = 3324.2
-05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13555811 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0583s; samplesPerSecond = 4291.1
+05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13555811 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0583s; samplesPerSecond = 4291.1
-05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.14883594 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0598s; samplesPerSecond = 4180.7
+05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.14883594 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0598s; samplesPerSecond = 4180.7
-05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.14724707 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0599s; samplesPerSecond = 4172.4
+05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.14724707 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0599s; samplesPerSecond = 4172.4
05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.13130469 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0664s; samplesPerSecond = 3764.2 05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.13130469 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0664s; samplesPerSecond = 3764.2
05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.19636084 * 250; EvalClassificationError = 0.11600000 * 250; time = 0.0644s; samplesPerSecond = 3884.1 05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.19636084 * 250; EvalErrorPrediction = 0.11600000 * 250; time = 0.0644s; samplesPerSecond = 3884.1
05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15681836 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0651s; samplesPerSecond = 3841.0 05/03/2016 15:21:23: Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15681836 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0651s; samplesPerSecond = 3841.0
05/03/2016 15:21:23: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.16173864 * 10000; EvalClassificationError = 0.07520000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=2.87283s 05/03/2016 15:21:23: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.16173864 * 10000; EvalErrorPrediction = 0.07520000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=2.87283s
05/03/2016 15:21:23: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152115.267374/CNTKTextFormatReader/Examples/Other/Simple2d_Simple@release_cpu/Models/simple.dnn.2' 05/03/2016 15:21:23: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152115.267374/CNTKTextFormatReader/Examples/Other/Simple2d_Simple@release_cpu/Models/simple.dnn.2'
05/03/2016 15:21:23: Starting Epoch 3: learning rate per sample = 0.008000 effective momentum = 0.900000 momentum as time constant = 237.3 samples 05/03/2016 15:21:23: Starting Epoch 3: learning rate per sample = 0.008000 effective momentum = 0.900000 momentum as time constant = 237.3 samples
05/03/2016 15:21:23: Starting minibatch loop. 05/03/2016 15:21:23: Starting minibatch loop.
05/03/2016 15:21:23: Epoch[ 3 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.18214960 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0604s; samplesPerSecond = 4138.7 05/03/2016 15:21:23: Epoch[ 3 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.18214960 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0604s; samplesPerSecond = 4138.7
05/03/2016 15:21:23: Epoch[ 3 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.13526825 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0622s; samplesPerSecond = 4020.6 05/03/2016 15:21:23: Epoch[ 3 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.13526825 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0622s; samplesPerSecond = 4020.6
05/03/2016 15:21:23: Epoch[ 3 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.14344995 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0640s; samplesPerSecond = 3906.0 05/03/2016 15:21:23: Epoch[ 3 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.14344995 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0640s; samplesPerSecond = 3906.0
05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.12557471 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0628s; samplesPerSecond = 3978.7 05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.12557471 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0628s; samplesPerSecond = 3978.7
05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.17627924 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0639s; samplesPerSecond = 3914.6 05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.17627924 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0639s; samplesPerSecond = 3914.6
05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.17585291 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0644s; samplesPerSecond = 3884.2 05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.17585291 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0644s; samplesPerSecond = 3884.2
05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.14716791 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0628s; samplesPerSecond = 3979.1 05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.14716791 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0628s; samplesPerSecond = 3979.1
05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.16757751 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0643s; samplesPerSecond = 3885.5 05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.16757751 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0643s; samplesPerSecond = 3885.5
05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.10314917 * 250; EvalClassificationError = 0.04000000 * 250; time = 0.0642s; samplesPerSecond = 3895.3 05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.10314917 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0642s; samplesPerSecond = 3895.3
05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.20322217 * 250; EvalClassificationError = 0.11200000 * 250; time = 0.0650s; samplesPerSecond = 3848.0 05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.20322217 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0650s; samplesPerSecond = 3848.0
05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.16604797 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0642s; samplesPerSecond = 3892.3 05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.16604797 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0642s; samplesPerSecond = 3892.3
05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.15105725 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0651s; samplesPerSecond = 3839.4 05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.15105725 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0651s; samplesPerSecond = 3839.4
05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.19206934 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0640s; samplesPerSecond = 3903.9 05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.19206934 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0640s; samplesPerSecond = 3903.9
05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13667065 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.2688s; samplesPerSecond = 930.0 05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13667065 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.2688s; samplesPerSecond = 930.0
05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.20713037 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0472s; samplesPerSecond = 5299.3 05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.20713037 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0472s; samplesPerSecond = 5299.3
05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.12862158 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0625s; samplesPerSecond = 3998.5 05/03/2016 15:21:24: Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.12862158 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0625s; samplesPerSecond = 3998.5
05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17174683 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0465s; samplesPerSecond = 5381.7 05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17174683 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0465s; samplesPerSecond = 5381.7
05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16493628 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0526s; samplesPerSecond = 4753.8 05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16493628 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0526s; samplesPerSecond = 4753.8
05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.14843726 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0505s; samplesPerSecond = 4952.5 05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.14843726 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0505s; samplesPerSecond = 4952.5
05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.12574292 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0505s; samplesPerSecond = 4951.4 05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.12574292 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0505s; samplesPerSecond = 4951.4
05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13455151 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0614s; samplesPerSecond = 4072.8 05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13455151 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0614s; samplesPerSecond = 4072.8
05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.16762988 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0495s; samplesPerSecond = 5055.0 05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.16762988 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0495s; samplesPerSecond = 5055.0
05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.22347461 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0523s; samplesPerSecond = 4780.1 05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.22347461 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0523s; samplesPerSecond = 4780.1
05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.18213623 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0542s; samplesPerSecond = 4611.6 05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.18213623 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0542s; samplesPerSecond = 4611.6
05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.19970923 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0539s; samplesPerSecond = 4638.8 05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.19970923 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0539s; samplesPerSecond = 4638.8
05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.22695947 * 250; EvalClassificationError = 0.12800000 * 250; time = 0.0542s; samplesPerSecond = 4609.7 05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.22695947 * 250; EvalErrorPrediction = 0.12800000 * 250; time = 0.0542s; samplesPerSecond = 4609.7
05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12664502 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0541s; samplesPerSecond = 4625.3 05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12664502 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0541s; samplesPerSecond = 4625.3
05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.15838037 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0538s; samplesPerSecond = 4648.8 05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.15838037 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0538s; samplesPerSecond = 4648.8
05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.11555566 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0581s; samplesPerSecond = 4305.4 05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.11555566 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0581s; samplesPerSecond = 4305.4
05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.14157520 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0544s; samplesPerSecond = 4595.2 05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.14157520 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0544s; samplesPerSecond = 4595.2
05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.18558350 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0541s; samplesPerSecond = 4622.4 05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.18558350 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0541s; samplesPerSecond = 4622.4
05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.15083594 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0540s; samplesPerSecond = 4632.9 05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.15083594 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0540s; samplesPerSecond = 4632.9
05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.12831787 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0541s; samplesPerSecond = 4624.1 05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.12831787 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0541s; samplesPerSecond = 4624.1
05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.17656494 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0545s; samplesPerSecond = 4587.6 05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.17656494 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0545s; samplesPerSecond = 4587.6
05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.14956396 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0625s; samplesPerSecond = 4000.3 05/03/2016 15:21:25: Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.14956396 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0625s; samplesPerSecond = 4000.3
05/03/2016 15:21:26: Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.11451660 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0496s; samplesPerSecond = 5040.3 05/03/2016 15:21:26: Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.11451660 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0496s; samplesPerSecond = 5040.3
05/03/2016 15:21:26: Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16392383 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0496s; samplesPerSecond = 5036.0 05/03/2016 15:21:26: Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16392383 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0496s; samplesPerSecond = 5036.0
05/03/2016 15:21:26: Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14811230 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0505s; samplesPerSecond = 4955.0 05/03/2016 15:21:26: Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14811230 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0505s; samplesPerSecond = 4955.0
05/03/2016 15:21:26: Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.16003760 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0588s; samplesPerSecond = 4255.2 05/03/2016 15:21:26: Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.16003760 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0588s; samplesPerSecond = 4255.2
05/03/2016 15:21:26: Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.17969775 * 250; EvalClassificationError = 0.09600000 * 250; time = 0.0482s; samplesPerSecond = 5185.4 05/03/2016 15:21:26: Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.17969775 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0482s; samplesPerSecond = 5185.4
05/03/2016 15:21:26: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.15964808 * 10000; EvalClassificationError = 0.07750000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=2.49695s 05/03/2016 15:21:26: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.15964808 * 10000; EvalErrorPrediction = 0.07750000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=2.49695s
05/03/2016 15:21:26: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152115.267374/CNTKTextFormatReader/Examples/Other/Simple2d_Simple@release_cpu/Models/simple.dnn' 05/03/2016 15:21:26: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152115.267374/CNTKTextFormatReader/Examples/Other/Simple2d_Simple@release_cpu/Models/simple.dnn'
05/03/2016 15:21:26: CNTKCommandTrainEnd: Simple_Demo_Train 05/03/2016 15:21:26: CNTKCommandTrainEnd: Simple_Demo_Train
@@ -608,7 +608,7 @@ Post-processing network...
7 roots: 7 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax() CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
EvalClassificationError = ClassificationError() EvalErrorPrediction = ErrorPrediction()
InvStdOfFeatures = InvStdDev() InvStdOfFeatures = InvStdDev()
MeanOfFeatures = Mean() MeanOfFeatures = Mean()
PosteriorProb = Softmax() PosteriorProb = Softmax()
@@ -637,7 +637,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *1] -> [2 x 1 x *1]
Validating --> B2 = LearnableParameter() : -> [2 x 1] Validating --> B2 = LearnableParameter() : -> [2 x 1]
Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *1], [2 x 1] -> [2 x 1 x *1] Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *1], [2 x 1] -> [2 x 1 x *1]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1] Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1] Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *1] -> [2 x 1 x *1] Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *1] -> [2 x 1 x *1]
Validating --> Prior = Mean (labels) : [2 x *1] -> [2] Validating --> Prior = Mean (labels) : [2 x *1] -> [2]
Validating --> LogOfPrior = Log (Prior) : [2] -> [2] Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@@ -661,7 +661,7 @@ Allocating matrices for forward and/or backward propagation.
Memory Sharing Structure: Memory Sharing Structure:
(nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalClassificationError Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] } (nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalErrorPrediction Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
0x2e83eb8: {[W2 Value[2 x 50]] } 0x2e83eb8: {[W2 Value[2 x 50]] }
0x2e87ac8: {[MVNormalizedFeatures Value[2 x *1]] } 0x2e87ac8: {[MVNormalizedFeatures Value[2 x *1]] }
0x2e87e78: {[W0*features Value[50 x *1]] } 0x2e87e78: {[W0*features Value[50 x *1]] }
@@ -676,7 +676,7 @@ Memory Sharing Structure:
0x2e8d298: {[B2 Value[2 x 1]] } 0x2e8d298: {[B2 Value[2 x 1]] }
0x2e8f2c8: {[labels Value[2 x *1]] } 0x2e8f2c8: {[labels Value[2 x *1]] }
0x2e8f8e8: {[MeanOfFeatures Value[2]] } 0x2e8f8e8: {[MeanOfFeatures Value[2]] }
0x2e91598: {[EvalClassificationError Value[1]] } 0x2e91598: {[EvalErrorPrediction Value[1]] }
0x2e916f8: {[CrossEntropyWithSoftmax Value[1]] } 0x2e916f8: {[CrossEntropyWithSoftmax Value[1]] }
0x2e91bb8: {[LogOfPrior Value[2]] } 0x2e91bb8: {[LogOfPrior Value[2]] }
0x2e93758: {[B0 Value[50 x 1]] } 0x2e93758: {[B0 Value[50 x 1]] }
@@ -686,7 +686,7 @@ Memory Sharing Structure:
0x2e985f8: {[W1 Value[50 x 50]] } 0x2e985f8: {[W1 Value[50 x 50]] }
0x2e99178: {[features Value[2 x *1]] } 0x2e99178: {[features Value[2 x *1]] }
05/03/2016 15:21:26: Final Results: Minibatch[1-1]: EvalClassificationError = 0.05970149 * 603; CrossEntropyWithSoftmax = 0.13085309 * 603; perplexity = 1.13980032 05/03/2016 15:21:26: Final Results: Minibatch[1-1]: EvalErrorPrediction = 0.05970149 * 603; CrossEntropyWithSoftmax = 0.13085309 * 603; perplexity = 1.13980032
05/03/2016 15:21:26: Action "test" complete. 05/03/2016 15:21:26: Action "test" complete.
@@ -702,7 +702,7 @@ Post-processing network...
8 roots: 8 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax() CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
EvalClassificationError = ClassificationError() EvalErrorPrediction = ErrorPrediction()
InvStdOfFeatures = InvStdDev() InvStdOfFeatures = InvStdDev()
MeanOfFeatures = Mean() MeanOfFeatures = Mean()
PosteriorProb = Softmax() PosteriorProb = Softmax()
@@ -732,7 +732,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *2] -> [2 x 1 x *2]
Validating --> B2 = LearnableParameter() : -> [2 x 1] Validating --> B2 = LearnableParameter() : -> [2 x 1]
Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *2], [2 x 1] -> [2 x 1 x *2] Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *2], [2 x 1] -> [2 x 1 x *2]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1] Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1]
Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1] Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1]
Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *2] -> [2 x 1 x *2] Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *2] -> [2 x 1 x *2]
Validating --> Prior = Mean (labels) : [2 x *2] -> [2] Validating --> Prior = Mean (labels) : [2 x *2] -> [2]
Validating --> LogOfPrior = Log (Prior) : [2] -> [2] Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@@ -755,7 +755,7 @@ Allocating matrices for forward and/or backward propagation.
Memory Sharing Structure: Memory Sharing Structure:
(nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [CrossEntropyWithSoftmax Value[1]] [EvalClassificationError Gradient[1]] [EvalClassificationError Value[1]] [H1 Gradient[50 x 1 x *2]] [H2 Gradient[50 x 1 x *2]] [HLast Gradient[2 x 1 x *2]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *2]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *2]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *2]] [ScaledLogLikelihood Value[2 x 1 x *2]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *2]] [W0*features+B0 Gradient[50 x 1 x *2]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *2]] [W1*H1+B1 Gradient[50 x 1 x *2]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *2]] [features Gradient[2 x *2]] [labels Gradient[2 x *2]] } (nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [CrossEntropyWithSoftmax Value[1]] [EvalErrorPrediction Gradient[1]] [EvalErrorPrediction Value[1]] [H1 Gradient[50 x 1 x *2]] [H2 Gradient[50 x 1 x *2]] [HLast Gradient[2 x 1 x *2]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *2]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *2]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *2]] [ScaledLogLikelihood Value[2 x 1 x *2]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *2]] [W0*features+B0 Gradient[50 x 1 x *2]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *2]] [W1*H1+B1 Gradient[50 x 1 x *2]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *2]] [features Gradient[2 x *2]] [labels Gradient[2 x *2]] }
0x2e82858: {[PosteriorProb Value[2 x 1 x *2]] } 0x2e82858: {[PosteriorProb Value[2 x 1 x *2]] }
0x2e83b58: {[labels Value[2 x *2]] } 0x2e83b58: {[labels Value[2 x *2]] }
0x2e84318: {[MeanOfFeatures Value[2]] } 0x2e84318: {[MeanOfFeatures Value[2]] }

View file

@@ -58,7 +58,7 @@ Simple_Demo_Train = [
SimpleNetworkBuilder = [ SimpleNetworkBuilder = [
layerSizes = 2:50*2:2 layerSizes = 2:50*2:2
trainingCriterion = "CrossEntropyWithSoftmax" trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ClassificationError" evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid" layerTypes = "Sigmoid"
initValueScale = 1.0 initValueScale = 1.0
applyMeanVarNorm = true applyMeanVarNorm = true
@@ -157,7 +157,7 @@ Simple_Demo_Train = [
SimpleNetworkBuilder = [ SimpleNetworkBuilder = [
layerSizes = 2:50*2:2 layerSizes = 2:50*2:2
trainingCriterion = "CrossEntropyWithSoftmax" trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ClassificationError" evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid" layerTypes = "Sigmoid"
initValueScale = 1.0 initValueScale = 1.0
applyMeanVarNorm = true applyMeanVarNorm = true
@@ -300,7 +300,7 @@ configparameters: Simple.cntk:Simple_Demo_Train=[
SimpleNetworkBuilder = [ SimpleNetworkBuilder = [
layerSizes = 2:50*2:2 layerSizes = 2:50*2:2
trainingCriterion = "CrossEntropyWithSoftmax" trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ClassificationError" evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid" layerTypes = "Sigmoid"
initValueScale = 1.0 initValueScale = 1.0
applyMeanVarNorm = true applyMeanVarNorm = true
@@ -356,7 +356,7 @@ Post-processing network...
7 roots: 7 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax() CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
EvalClassificationError = ClassificationError() EvalErrorPrediction = ErrorPrediction()
InvStdOfFeatures = InvStdDev() InvStdOfFeatures = InvStdDev()
MeanOfFeatures = Mean() MeanOfFeatures = Mean()
PosteriorProb = Softmax() PosteriorProb = Softmax()
@@ -385,7 +385,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *] -> [2 x 1 x *]
Validating --> B2 = LearnableParameter() : -> [2 x 1] Validating --> B2 = LearnableParameter() : -> [2 x 1]
Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *], [2 x 1] -> [2 x 1 x *] Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *], [2 x 1] -> [2 x 1 x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *], [2 x 1 x *] -> [1] Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *], [2 x 1 x *] -> [1] Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *] -> [2 x 1 x *] Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *] -> [2 x 1 x *]
Validating --> Prior = Mean (labels) : [2 x *] -> [2] Validating --> Prior = Mean (labels) : [2 x *] -> [2]
Validating --> LogOfPrior = Log (Prior) : [2] -> [2] Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@@ -409,14 +409,14 @@ Post-processing network complete.
05/03/2016 15:21:27: Evaluation criterion node(s): 05/03/2016 15:21:27: Evaluation criterion node(s):
05/03/2016 15:21:27: EvalClassificationError = ClassificationError 05/03/2016 15:21:27: EvalErrorPrediction = ErrorPrediction
Allocating matrices for forward and/or backward propagation. Allocating matrices for forward and/or backward propagation.
Memory Sharing Structure: Memory Sharing Structure:
(nil): {[EvalClassificationError Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] } (nil): {[EvalErrorPrediction Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
0x1ef9338: {[features Value[2 x *]] } 0x1ef9338: {[features Value[2 x *]] }
0x2b32ad8: {[MeanOfFeatures Value[2]] } 0x2b32ad8: {[MeanOfFeatures Value[2]] }
0x2b32fe8: {[InvStdOfFeatures Value[2]] } 0x2b32fe8: {[InvStdOfFeatures Value[2]] }
@@ -429,7 +429,7 @@ Memory Sharing Structure:
0x3185898: {[Prior Value[2]] } 0x3185898: {[Prior Value[2]] }
0x3186bd8: {[LogOfPrior Value[2]] } 0x3186bd8: {[LogOfPrior Value[2]] }
0x318b378: {[H1 Value[50 x 1 x *]] [W0*features Gradient[50 x *]] } 0x318b378: {[H1 Value[50 x 1 x *]] [W0*features Gradient[50 x *]] }
0x318b498: {[EvalClassificationError Value[1]] } 0x318b498: {[EvalErrorPrediction Value[1]] }
0x318b798: {[ScaledLogLikelihood Value[2 x 1 x *]] } 0x318b798: {[ScaledLogLikelihood Value[2 x 1 x *]] }
0x318b8f8: {[CrossEntropyWithSoftmax Value[1]] } 0x318b8f8: {[CrossEntropyWithSoftmax Value[1]] }
0x3191148: {[B0 Value[50 x 1]] } 0x3191148: {[B0 Value[50 x 1]] }
@@ -459,139 +459,139 @@ Memory Sharing Structure:
05/03/2016 15:21:28: Starting Epoch 1: learning rate per sample = 0.020000 effective momentum = 0.900000 momentum as time constant = 237.3 samples 05/03/2016 15:21:28: Starting Epoch 1: learning rate per sample = 0.020000 effective momentum = 0.900000 momentum as time constant = 237.3 samples
05/03/2016 15:21:28: Starting minibatch loop. 05/03/2016 15:21:28: Starting minibatch loop.
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 1- 10]: CrossEntropyWithSoftmax = 0.70004456 * 250; EvalClassificationError = 0.52000000 * 250; time = 0.0055s; samplesPerSecond = 45495.9 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 1- 10]: CrossEntropyWithSoftmax = 0.70004456 * 250; EvalErrorPrediction = 0.52000000 * 250; time = 0.0055s; samplesPerSecond = 45495.9
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 11- 20]: CrossEntropyWithSoftmax = 0.70309900 * 250; EvalClassificationError = 0.51600000 * 250; time = 0.0046s; samplesPerSecond = 54347.8 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 11- 20]: CrossEntropyWithSoftmax = 0.70309900 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0046s; samplesPerSecond = 54347.8
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 21- 30]: CrossEntropyWithSoftmax = 0.70606104 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0046s; samplesPerSecond = 54241.7 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 21- 30]: CrossEntropyWithSoftmax = 0.70606104 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0046s; samplesPerSecond = 54241.7
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 31- 40]: CrossEntropyWithSoftmax = 0.69845532 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0046s; samplesPerSecond = 54549.4 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 31- 40]: CrossEntropyWithSoftmax = 0.69845532 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0046s; samplesPerSecond = 54549.4
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 41- 50]: CrossEntropyWithSoftmax = 0.73496533 * 250; EvalClassificationError = 0.57600000 * 250; time = 0.0046s; samplesPerSecond = 54136.0 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 41- 50]: CrossEntropyWithSoftmax = 0.73496533 * 250; EvalErrorPrediction = 0.57600000 * 250; time = 0.0046s; samplesPerSecond = 54136.0
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 51- 60]: CrossEntropyWithSoftmax = 0.72522827 * 250; EvalClassificationError = 0.50800000 * 250; time = 0.0046s; samplesPerSecond = 54359.6 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 51- 60]: CrossEntropyWithSoftmax = 0.72522827 * 250; EvalErrorPrediction = 0.50800000 * 250; time = 0.0046s; samplesPerSecond = 54359.6
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 61- 70]: CrossEntropyWithSoftmax = 0.73287500 * 250; EvalClassificationError = 0.48800000 * 250; time = 0.0046s; samplesPerSecond = 54466.2 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 61- 70]: CrossEntropyWithSoftmax = 0.73287500 * 250; EvalErrorPrediction = 0.48800000 * 250; time = 0.0046s; samplesPerSecond = 54466.2
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 71- 80]: CrossEntropyWithSoftmax = 0.70135547 * 250; EvalClassificationError = 0.51600000 * 250; time = 0.0046s; samplesPerSecond = 54872.7 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 71- 80]: CrossEntropyWithSoftmax = 0.70135547 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0046s; samplesPerSecond = 54872.7
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 81- 90]: CrossEntropyWithSoftmax = 0.72466504 * 250; EvalClassificationError = 0.46000000 * 250; time = 0.0046s; samplesPerSecond = 54194.7 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 81- 90]: CrossEntropyWithSoftmax = 0.72466504 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0046s; samplesPerSecond = 54194.7
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 91- 100]: CrossEntropyWithSoftmax = 0.72187500 * 250; EvalClassificationError = 0.52000000 * 250; time = 0.0046s; samplesPerSecond = 54501.9 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 91- 100]: CrossEntropyWithSoftmax = 0.72187500 * 250; EvalErrorPrediction = 0.52000000 * 250; time = 0.0046s; samplesPerSecond = 54501.9
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.69799023 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 54788.5 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.69799023 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 54788.5
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.70696387 * 250; EvalClassificationError = 0.54800000 * 250; time = 0.0046s; samplesPerSecond = 54371.5 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.70696387 * 250; EvalErrorPrediction = 0.54800000 * 250; time = 0.0046s; samplesPerSecond = 54371.5
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.69863965 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0046s; samplesPerSecond = 54300.6 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.69863965 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0046s; samplesPerSecond = 54300.6
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.71772461 * 250; EvalClassificationError = 0.54800000 * 250; time = 0.0046s; samplesPerSecond = 54644.8 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.71772461 * 250; EvalErrorPrediction = 0.54800000 * 250; time = 0.0046s; samplesPerSecond = 54644.8
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.69526270 * 250; EvalClassificationError = 0.52400000 * 250; time = 0.0046s; samplesPerSecond = 54525.6 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.69526270 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0046s; samplesPerSecond = 54525.6
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.71436426 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0046s; samplesPerSecond = 54561.3 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.71436426 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0046s; samplesPerSecond = 54561.3
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70399316 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0046s; samplesPerSecond = 54573.2 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70399316 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0046s; samplesPerSecond = 54573.2
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.71745508 * 250; EvalClassificationError = 0.48800000 * 250; time = 0.0046s; samplesPerSecond = 54716.6 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.71745508 * 250; EvalErrorPrediction = 0.48800000 * 250; time = 0.0046s; samplesPerSecond = 54716.6
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.71963184 * 250; EvalClassificationError = 0.49600000 * 250; time = 0.0046s; samplesPerSecond = 54537.5 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.71963184 * 250; EvalErrorPrediction = 0.49600000 * 250; time = 0.0046s; samplesPerSecond = 54537.5
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.70689941 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0046s; samplesPerSecond = 54336.0 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.70689941 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0046s; samplesPerSecond = 54336.0
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.70425098 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 54692.6 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.70425098 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 54692.6
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70622754 * 250; EvalClassificationError = 0.45200000 * 250; time = 0.0046s; samplesPerSecond = 54561.3 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70622754 * 250; EvalErrorPrediction = 0.45200000 * 250; time = 0.0046s; samplesPerSecond = 54561.3
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69729492 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 54537.5 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69729492 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 54537.5
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.75974219 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0046s; samplesPerSecond = 54680.7 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.75974219 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0046s; samplesPerSecond = 54680.7
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.70631250 * 250; EvalClassificationError = 0.43600000 * 250; time = 0.0046s; samplesPerSecond = 54288.8 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.70631250 * 250; EvalErrorPrediction = 0.43600000 * 250; time = 0.0046s; samplesPerSecond = 54288.8
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.70705664 * 250; EvalClassificationError = 0.46400000 * 250; time = 0.0046s; samplesPerSecond = 54561.3 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.70705664 * 250; EvalErrorPrediction = 0.46400000 * 250; time = 0.0046s; samplesPerSecond = 54561.3
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.72660352 * 250; EvalClassificationError = 0.51600000 * 250; time = 0.0046s; samplesPerSecond = 54824.6 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.72660352 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0046s; samplesPerSecond = 54824.6
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.71369727 * 250; EvalClassificationError = 0.55600000 * 250; time = 0.0046s; samplesPerSecond = 54537.5 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.71369727 * 250; EvalErrorPrediction = 0.55600000 * 250; time = 0.0046s; samplesPerSecond = 54537.5
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.68916602 * 250; EvalClassificationError = 0.47200000 * 250; time = 0.0046s; samplesPerSecond = 54371.5 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.68916602 * 250; EvalErrorPrediction = 0.47200000 * 250; time = 0.0046s; samplesPerSecond = 54371.5
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.69964844 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0046s; samplesPerSecond = 54218.2 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.69964844 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0046s; samplesPerSecond = 54218.2
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.69387891 * 250; EvalClassificationError = 0.46000000 * 250; time = 0.0045s; samplesPerSecond = 54969.2 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.69387891 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0045s; samplesPerSecond = 54969.2
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.68885742 * 250; EvalClassificationError = 0.50400000 * 250; time = 0.0046s; samplesPerSecond = 54573.2 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.68885742 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.0046s; samplesPerSecond = 54573.2
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.69388867 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 54454.4 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.69388867 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0046s; samplesPerSecond = 54454.4
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.70363867 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0046s; samplesPerSecond = 54824.6 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.70363867 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0046s; samplesPerSecond = 54824.6
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.65449219 * 250; EvalClassificationError = 0.44400000 * 250; time = 0.0046s; samplesPerSecond = 54561.3 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.65449219 * 250; EvalErrorPrediction = 0.44400000 * 250; time = 0.0046s; samplesPerSecond = 54561.3
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.64607031 * 250; EvalClassificationError = 0.46000000 * 250; time = 0.0046s; samplesPerSecond = 54347.8 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.64607031 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0046s; samplesPerSecond = 54347.8
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.59492969 * 250; EvalClassificationError = 0.12400000 * 250; time = 0.0046s; samplesPerSecond = 54764.5 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.59492969 * 250; EvalErrorPrediction = 0.12400000 * 250; time = 0.0046s; samplesPerSecond = 54764.5
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.53965820 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54609.0 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.53965820 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54609.0
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.43681445 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0046s; samplesPerSecond = 54525.6 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.43681445 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0046s; samplesPerSecond = 54525.6
05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.37407422 * 250; EvalClassificationError = 0.12000000 * 250; time = 0.0046s; samplesPerSecond = 54466.2 05/03/2016 15:21:28: Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.37407422 * 250; EvalErrorPrediction = 0.12000000 * 250; time = 0.0046s; samplesPerSecond = 54466.2
05/03/2016 15:21:28: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.68409629 * 10000; EvalClassificationError = 0.45780000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=0.1879s 05/03/2016 15:21:28: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.68409629 * 10000; EvalErrorPrediction = 0.45780000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=0.1879s
05/03/2016 15:21:28: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152115.267374/CNTKTextFormatReader/Examples/Other/Simple2d_Simple@release_gpu/Models/simple.dnn.1' 05/03/2016 15:21:28: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152115.267374/CNTKTextFormatReader/Examples/Other/Simple2d_Simple@release_gpu/Models/simple.dnn.1'
05/03/2016 15:21:28: Starting Epoch 2: learning rate per sample = 0.008000 effective momentum = 0.900000 momentum as time constant = 237.3 samples 05/03/2016 15:21:28: Starting Epoch 2: learning rate per sample = 0.008000 effective momentum = 0.900000 momentum as time constant = 237.3 samples
05/03/2016 15:21:28: Starting minibatch loop. 05/03/2016 15:21:28: Starting minibatch loop.
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.27895840 * 250; EvalClassificationError = 0.09600000 * 250; time = 0.0046s; samplesPerSecond = 53902.5 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.27895840 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0046s; samplesPerSecond = 53902.5
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.24395615 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54933.0 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.24395615 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54933.0
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.19587115 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0046s; samplesPerSecond = 54824.6 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.19587115 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0046s; samplesPerSecond = 54824.6
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.16368213 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0045s; samplesPerSecond = 55126.8 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.16368213 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0045s; samplesPerSecond = 55126.8
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.19700140 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0046s; samplesPerSecond = 54933.0 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.19700140 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0046s; samplesPerSecond = 54933.0
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.19580530 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54585.2 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.19580530 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54585.2
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.18257983 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0045s; samplesPerSecond = 55248.6 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.18257983 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0045s; samplesPerSecond = 55248.6
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.17520911 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0046s; samplesPerSecond = 54752.5 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.17520911 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0046s; samplesPerSecond = 54752.5
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.20164514 * 250; EvalClassificationError = 0.10800000 * 250; time = 0.0046s; samplesPerSecond = 54752.5 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.20164514 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0046s; samplesPerSecond = 54752.5
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.19787024 * 250; EvalClassificationError = 0.10000000 * 250; time = 0.0046s; samplesPerSecond = 54466.2 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.19787024 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0046s; samplesPerSecond = 54466.2
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.13437573 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0045s; samplesPerSecond = 55090.3 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.13437573 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0045s; samplesPerSecond = 55090.3
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.19004956 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0046s; samplesPerSecond = 54848.6 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.19004956 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0046s; samplesPerSecond = 54848.6
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.12287280 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0045s; samplesPerSecond = 54957.1 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.12287280 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0045s; samplesPerSecond = 54957.1
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16975903 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0045s; samplesPerSecond = 55175.5 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16975903 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0045s; samplesPerSecond = 55175.5
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.16102686 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54513.7 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.16102686 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54513.7
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.18611646 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54800.5 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.18611646 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54800.5
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.18469507 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0045s; samplesPerSecond = 55334.2 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.18469507 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0045s; samplesPerSecond = 55334.2
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18472339 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54908.9 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18472339 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54908.9
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.20064648 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0046s; samplesPerSecond = 54597.1 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.20064648 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0046s; samplesPerSecond = 54597.1
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.13324683 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0045s; samplesPerSecond = 54969.2 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.13324683 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0045s; samplesPerSecond = 54969.2
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13878418 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0045s; samplesPerSecond = 55078.2 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13878418 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0045s; samplesPerSecond = 55078.2
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.15587354 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0046s; samplesPerSecond = 54920.9 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.15587354 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0046s; samplesPerSecond = 54920.9
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.15337378 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54812.5 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.15337378 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54812.5
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.14797070 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0045s; samplesPerSecond = 55199.8 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.14797070 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0045s; samplesPerSecond = 55199.8
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.12512891 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0046s; samplesPerSecond = 54383.3 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.12512891 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0046s; samplesPerSecond = 54383.3
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.14058545 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0045s; samplesPerSecond = 54993.4 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.14058545 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0045s; samplesPerSecond = 54993.4
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12611963 * 250; EvalClassificationError = 0.04400000 * 250; time = 0.0045s; samplesPerSecond = 54945.1 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12611963 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0045s; samplesPerSecond = 54945.1
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.18970605 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0046s; samplesPerSecond = 54884.7 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.18970605 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0046s; samplesPerSecond = 54884.7
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.17965479 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0045s; samplesPerSecond = 54969.2 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.17965479 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0045s; samplesPerSecond = 54969.2
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.18866455 * 250; EvalClassificationError = 0.09600000 * 250; time = 0.0046s; samplesPerSecond = 54836.6 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.18866455 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0046s; samplesPerSecond = 54836.6
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17539941 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0045s; samplesPerSecond = 54945.1 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17539941 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0045s; samplesPerSecond = 54945.1
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.14742432 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54848.6 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.14742432 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54848.6
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.13789502 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0046s; samplesPerSecond = 54788.5 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.13789502 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0046s; samplesPerSecond = 54788.5
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.13652100 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0045s; samplesPerSecond = 55224.2 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.13652100 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0045s; samplesPerSecond = 55224.2
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13619336 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0046s; samplesPerSecond = 54920.9 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13619336 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0046s; samplesPerSecond = 54920.9
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.14909424 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54478.1 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.14909424 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54478.1
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.14762256 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0045s; samplesPerSecond = 55139.0 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.14762256 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0045s; samplesPerSecond = 55139.0
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.13142578 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0046s; samplesPerSecond = 54860.7 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.13142578 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0046s; samplesPerSecond = 54860.7
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.19570459 * 250; EvalClassificationError = 0.11600000 * 250; time = 0.0046s; samplesPerSecond = 54764.5 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.19570459 * 250; EvalErrorPrediction = 0.11600000 * 250; time = 0.0046s; samplesPerSecond = 54764.5
05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15718604 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0045s; samplesPerSecond = 55005.5 05/03/2016 15:21:28: Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15718604 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0045s; samplesPerSecond = 55005.5
05/03/2016 15:21:28: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.16901047 * 10000; EvalClassificationError = 0.07510000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=0.184798s 05/03/2016 15:21:28: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.16901047 * 10000; EvalErrorPrediction = 0.07510000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=0.184798s
05/03/2016 15:21:28: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152115.267374/CNTKTextFormatReader/Examples/Other/Simple2d_Simple@release_gpu/Models/simple.dnn.2' 05/03/2016 15:21:28: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152115.267374/CNTKTextFormatReader/Examples/Other/Simple2d_Simple@release_gpu/Models/simple.dnn.2'
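Each progress line above reports its criteria as "average * sampleCount", so per-epoch totals fall out by multiplying the pair. A minimal parsing sketch in Python (the regex and helper name are illustrative, not part of CNTK):

    import re

    # Pull every "name = average * count" pair out of a CNTK progress line.
    METRIC = re.compile(r"(\w+) = ([\d.]+) \* (\d+)")

    def parse_metrics(line):
        """Return {criterion: (average, sample_count)}."""
        return {name: (float(avg), int(count))
                for name, avg, count in METRIC.findall(line)}

    line = ("05/03/2016 15:21:28: Finished Epoch[ 2 of 3]: [Training] "
            "CrossEntropyWithSoftmax = 0.16901047 * 10000; "
            "EvalErrorPrediction = 0.07510000 * 10000")
    avg, n = parse_metrics(line)["EvalErrorPrediction"]
    print(round(avg * n))  # 751 samples misclassified in epoch 2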
05/03/2016 15:21:28: Starting Epoch 3: learning rate per sample = 0.008000 effective momentum = 0.900000 momentum as time constant = 237.3 samples 05/03/2016 15:21:28: Starting Epoch 3: learning rate per sample = 0.008000 effective momentum = 0.900000 momentum as time constant = 237.3 samples
05/03/2016 15:21:28: Starting minibatch loop. 05/03/2016 15:21:28: Starting minibatch loop.
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.18133401 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54124.3 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.18133401 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54124.3
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.13605756 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0046s; samplesPerSecond = 54884.7 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.13605756 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0046s; samplesPerSecond = 54884.7
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.14345651 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54668.7 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.14345651 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54668.7
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.12512610 * 250; EvalClassificationError = 0.05200000 * 250; time = 0.0045s; samplesPerSecond = 54969.2 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.12512610 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0045s; samplesPerSecond = 54969.2
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.17690991 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54800.5 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.17690991 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54800.5
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.17504150 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0046s; samplesPerSecond = 54740.5 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.17504150 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0046s; samplesPerSecond = 54740.5
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.14723834 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0045s; samplesPerSecond = 55224.2 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.14723834 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0045s; samplesPerSecond = 55224.2
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.16752893 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0045s; samplesPerSecond = 54993.4 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.16752893 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0045s; samplesPerSecond = 54993.4
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.10317773 * 250; EvalClassificationError = 0.04000000 * 250; time = 0.0046s; samplesPerSecond = 54800.5 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.10317773 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0046s; samplesPerSecond = 54800.5
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.20306372 * 250; EvalClassificationError = 0.11200000 * 250; time = 0.0045s; samplesPerSecond = 55248.6 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.20306372 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0045s; samplesPerSecond = 55248.6
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.16637036 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0045s; samplesPerSecond = 55066.1 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.16637036 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0045s; samplesPerSecond = 55066.1
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.15126868 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54824.6 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.15126868 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54824.6
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.19167224 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0046s; samplesPerSecond = 54884.7 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.19167224 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0046s; samplesPerSecond = 54884.7
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13687085 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0045s; samplesPerSecond = 55420.1 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13687085 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0045s; samplesPerSecond = 55420.1
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.20709912 * 250; EvalClassificationError = 0.08800000 * 250; time = 0.0046s; samplesPerSecond = 54740.5 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.20709912 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0046s; samplesPerSecond = 54740.5
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.12918774 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0045s; samplesPerSecond = 54981.3 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.12918774 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0045s; samplesPerSecond = 54981.3
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17185107 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0045s; samplesPerSecond = 55322.0 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17185107 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0045s; samplesPerSecond = 55322.0
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16523242 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54908.9 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16523242 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54908.9
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.14880249 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0046s; samplesPerSecond = 54728.5 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.14880249 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0046s; samplesPerSecond = 54728.5
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.12590967 * 250; EvalClassificationError = 0.06800000 * 250; time = 0.0045s; samplesPerSecond = 54957.1 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.12590967 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0045s; samplesPerSecond = 54957.1
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13443018 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54872.7 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.13443018 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54872.7
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.16726147 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0046s; samplesPerSecond = 54836.6 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.16726147 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0046s; samplesPerSecond = 54836.6
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.22407422 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0045s; samplesPerSecond = 55041.8 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.22407422 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0045s; samplesPerSecond = 55041.8
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.18191553 * 250; EvalClassificationError = 0.10400000 * 250; time = 0.0045s; samplesPerSecond = 55078.2 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.18191553 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0045s; samplesPerSecond = 55078.2
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.19983057 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0046s; samplesPerSecond = 54680.7 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.19983057 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0046s; samplesPerSecond = 54680.7
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.22728223 * 250; EvalClassificationError = 0.12400000 * 250; time = 0.0046s; samplesPerSecond = 54692.6 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.22728223 * 250; EvalErrorPrediction = 0.12400000 * 250; time = 0.0046s; samplesPerSecond = 54692.6
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12720459 * 250; EvalClassificationError = 0.06000000 * 250; time = 0.0045s; samplesPerSecond = 55151.1 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.12720459 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0045s; samplesPerSecond = 55151.1
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.15842871 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0045s; samplesPerSecond = 54945.1 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.15842871 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0045s; samplesPerSecond = 54945.1
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.11558691 * 250; EvalClassificationError = 0.05600000 * 250; time = 0.0045s; samplesPerSecond = 54945.1 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.11558691 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0045s; samplesPerSecond = 54945.1
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.14163428 * 250; EvalClassificationError = 0.07200000 * 250; time = 0.0045s; samplesPerSecond = 55248.6 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.14163428 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0045s; samplesPerSecond = 55248.6
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.18560596 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0045s; samplesPerSecond = 54993.4 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.18560596 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0045s; samplesPerSecond = 54993.4
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.15099561 * 250; EvalClassificationError = 0.08000000 * 250; time = 0.0045s; samplesPerSecond = 55078.2 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.15099561 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0045s; samplesPerSecond = 55078.2
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.12822461 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54395.1 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.12822461 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0046s; samplesPerSecond = 54395.1
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.17662500 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0045s; samplesPerSecond = 55309.7 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.17662500 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0045s; samplesPerSecond = 55309.7
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.14950781 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0046s; samplesPerSecond = 54945.1 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.14950781 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0046s; samplesPerSecond = 54945.1
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.11450977 * 250; EvalClassificationError = 0.04800000 * 250; time = 0.0046s; samplesPerSecond = 54908.9 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.11450977 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0046s; samplesPerSecond = 54908.9
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16386768 * 250; EvalClassificationError = 0.07600000 * 250; time = 0.0045s; samplesPerSecond = 55260.8 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16386768 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0045s; samplesPerSecond = 55260.8
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14811523 * 250; EvalClassificationError = 0.06400000 * 250; time = 0.0045s; samplesPerSecond = 54981.3 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14811523 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0045s; samplesPerSecond = 54981.3
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.16021143 * 250; EvalClassificationError = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54764.5 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.16021143 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0046s; samplesPerSecond = 54764.5
05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.17989551 * 250; EvalClassificationError = 0.09200000 * 250; time = 0.0045s; samplesPerSecond = 55151.1 05/03/2016 15:21:28: Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.17989551 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0045s; samplesPerSecond = 55151.1
05/03/2016 15:21:28: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.15971016 * 10000; EvalClassificationError = 0.07740000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=0.184406s 05/03/2016 15:21:28: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.15971016 * 10000; EvalErrorPrediction = 0.07740000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=0.184406s
05/03/2016 15:21:28: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152115.267374/CNTKTextFormatReader/Examples/Other/Simple2d_Simple@release_gpu/Models/simple.dnn' 05/03/2016 15:21:28: SGD: Saving checkpoint model '/tmp/cntk-test-20160503152115.267374/CNTKTextFormatReader/Examples/Other/Simple2d_Simple@release_gpu/Models/simple.dnn'
05/03/2016 15:21:29: CNTKCommandTrainEnd: Simple_Demo_Train 05/03/2016 15:21:29: CNTKCommandTrainEnd: Simple_Demo_Train
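The epoch headers also report "momentum as time constant": the number of samples over which a gradient contribution decays by 1/e under the effective momentum, given the 25-sample minibatches used here (250 samples per block of 10 minibatches). A quick check of the 237.3 figure:

    import math

    minibatch_size = 25   # 250 samples per 10 minibatches in the log above
    momentum = 0.9        # effective momentum per minibatch
    print(round(-minibatch_size / math.log(momentum), 1))  # 237.3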
@ -609,7 +609,7 @@ Post-processing network...
7 roots: 7 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax() CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
EvalClassificationError = ClassificationError() EvalErrorPrediction = ErrorPrediction()
InvStdOfFeatures = InvStdDev() InvStdOfFeatures = InvStdDev()
MeanOfFeatures = Mean() MeanOfFeatures = Mean()
PosteriorProb = Softmax() PosteriorProb = Softmax()
@ -638,7 +638,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *1] -> [2 x 1 x *1]
Validating --> B2 = LearnableParameter() : -> [2 x 1] Validating --> B2 = LearnableParameter() : -> [2 x 1]
Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *1], [2 x 1] -> [2 x 1 x *1] Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *1], [2 x 1] -> [2 x 1 x *1]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1] Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1] Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *1] -> [2 x 1 x *1] Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *1] -> [2 x 1 x *1]
Validating --> Prior = Mean (labels) : [2 x *1] -> [2] Validating --> Prior = Mean (labels) : [2 x *1] -> [2]
Validating --> LogOfPrior = Log (Prior) : [2] -> [2] Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
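The validated graph is the Simple2d feed-forward net: mean/variance-normalized 2-d features through two 50-unit sigmoid layers to a 2-way output. A rough numpy rendering of the same forward pass (random weights standing in for the trained parameters; note that the node named W2*H1 actually multiplies W2 by H2):

    import numpy as np

    rng = np.random.default_rng(0)
    W0, B0 = rng.standard_normal((50, 2)), np.zeros((50, 1))
    W1, B1 = rng.standard_normal((50, 50)), np.zeros((50, 1))
    W2, B2 = rng.standard_normal((2, 50)), np.zeros((2, 1))

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def forward(features, mean, inv_std):          # features: [2 x N]
        x = (features - mean) * inv_std            # MVNormalizedFeatures
        h1 = sigmoid(W0 @ x + B0)                  # H1: [50 x N]
        h2 = sigmoid(W1 @ h1 + B1)                 # H2: [50 x N]
        return W2 @ h2 + B2                        # HLast: [2 x N]

    def error_prediction(labels, hlast):
        # ErrorPrediction: fraction of samples where argmax(HLast) != argmax(labels)
        return np.mean(hlast.argmax(axis=0) != labels.argmax(axis=0))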
@ -662,11 +662,11 @@ Allocating matrices for forward and/or backward propagation.
Memory Sharing Structure: Memory Sharing Structure:
(nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalClassificationError Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] } (nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalErrorPrediction Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
0x1efcc08: {[B2 Value[2 x 1]] } 0x1efcc08: {[B2 Value[2 x 1]] }
0x1efd8c8: {[W0 Value[50 x 2]] } 0x1efd8c8: {[W0 Value[50 x 2]] }
0x1efee68: {[InvStdOfFeatures Value[2]] } 0x1efee68: {[InvStdOfFeatures Value[2]] }
0x2b337e8: {[EvalClassificationError Value[1]] } 0x2b337e8: {[EvalErrorPrediction Value[1]] }
0x2b33948: {[CrossEntropyWithSoftmax Value[1]] } 0x2b33948: {[CrossEntropyWithSoftmax Value[1]] }
0x2b33f08: {[LogOfPrior Value[2]] } 0x2b33f08: {[LogOfPrior Value[2]] }
0x31808e8: {[W2 Value[2 x 50]] } 0x31808e8: {[W2 Value[2 x 50]] }
@ -687,7 +687,7 @@ Memory Sharing Structure:
0x7273058: {[W2*H1 Value[2 x 1 x *1]] } 0x7273058: {[W2*H1 Value[2 x 1 x *1]] }
0x7273218: {[HLast Value[2 x 1 x *1]] } 0x7273218: {[HLast Value[2 x 1 x *1]] }
05/03/2016 15:21:29: Final Results: Minibatch[1-1]: EvalClassificationError = 0.05970149 * 603; CrossEntropyWithSoftmax = 0.13093129 * 603; perplexity = 1.13988946 05/03/2016 15:21:29: Final Results: Minibatch[1-1]: EvalErrorPrediction = 0.05970149 * 603; CrossEntropyWithSoftmax = 0.13093129 * 603; perplexity = 1.13988946
05/03/2016 15:21:29: Action "test" complete. 05/03/2016 15:21:29: Action "test" complete.
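The perplexity in the final results line is simply exp of the average cross entropy:

    import math

    print(math.exp(0.13093129))  # 1.13988946..., the reported perplexity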
@ -703,7 +703,7 @@ Post-processing network...
8 roots: 8 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax() CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
EvalClassificationError = ClassificationError() EvalErrorPrediction = ErrorPrediction()
InvStdOfFeatures = InvStdDev() InvStdOfFeatures = InvStdDev()
MeanOfFeatures = Mean() MeanOfFeatures = Mean()
PosteriorProb = Softmax() PosteriorProb = Softmax()
@ -733,7 +733,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *2] -> [2 x 1 x *2]
Validating --> B2 = LearnableParameter() : -> [2 x 1] Validating --> B2 = LearnableParameter() : -> [2 x 1]
Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *2], [2 x 1] -> [2 x 1 x *2] Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *2], [2 x 1] -> [2 x 1 x *2]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1] Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1]
Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1] Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1]
Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *2] -> [2 x 1 x *2] Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *2] -> [2 x 1 x *2]
Validating --> Prior = Mean (labels) : [2 x *2] -> [2] Validating --> Prior = Mean (labels) : [2 x *2] -> [2]
Validating --> LogOfPrior = Log (Prior) : [2] -> [2] Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@ -756,7 +756,7 @@ Allocating matrices for forward and/or backward propagation.
Memory Sharing Structure: Memory Sharing Structure:
(nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [CrossEntropyWithSoftmax Value[1]] [EvalClassificationError Gradient[1]] [EvalClassificationError Value[1]] [H1 Gradient[50 x 1 x *2]] [H2 Gradient[50 x 1 x *2]] [HLast Gradient[2 x 1 x *2]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *2]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *2]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *2]] [ScaledLogLikelihood Value[2 x 1 x *2]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *2]] [W0*features+B0 Gradient[50 x 1 x *2]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *2]] [W1*H1+B1 Gradient[50 x 1 x *2]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *2]] [features Gradient[2 x *2]] [labels Gradient[2 x *2]] } (nil): {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [CrossEntropyWithSoftmax Value[1]] [EvalErrorPrediction Gradient[1]] [EvalErrorPrediction Value[1]] [H1 Gradient[50 x 1 x *2]] [H2 Gradient[50 x 1 x *2]] [HLast Gradient[2 x 1 x *2]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *2]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *2]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *2]] [ScaledLogLikelihood Value[2 x 1 x *2]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *2]] [W0*features+B0 Gradient[50 x 1 x *2]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *2]] [W1*H1+B1 Gradient[50 x 1 x *2]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *2]] [features Gradient[2 x *2]] [labels Gradient[2 x *2]] }
0x1efcef8: {[features Value[2 x *2]] } 0x1efcef8: {[features Value[2 x *2]] }
0x1efe2c8: {[labels Value[2 x *2]] } 0x1efe2c8: {[labels Value[2 x *2]] }
0x1eff188: {[PosteriorProb Value[2 x 1 x *2]] } 0x1eff188: {[PosteriorProb Value[2 x 1 x *2]] }

View file

@ -56,7 +56,7 @@ Simple_Demo_Train = [
SimpleNetworkBuilder = [ SimpleNetworkBuilder = [
layerSizes = 2:50*2:2 layerSizes = 2:50*2:2
trainingCriterion = "CrossEntropyWithSoftmax" trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ClassificationError" evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid" layerTypes = "Sigmoid"
initValueScale = 1.0 initValueScale = 1.0
applyMeanVarNorm = true applyMeanVarNorm = true
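In the SimpleNetworkBuilder block, layerSizes = 2:50*2:2 is shorthand for a 2-input network with two 50-unit hidden layers and 2 outputs; the n*k form repeats dimension n k times. A small sketch of that expansion (the parser itself is hypothetical):

    def expand_layer_sizes(spec):
        """Expand layerSizes shorthand, e.g. "2:50*2:2" -> [2, 50, 50, 2]."""
        sizes = []
        for part in spec.split(":"):
            if "*" in part:
                dim, repeat = part.split("*")
                sizes.extend([int(dim)] * int(repeat))
            else:
                sizes.append(int(part))
        return sizes

    assert expand_layer_sizes("2:50*2:2") == [2, 50, 50, 2]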
@ -155,7 +155,7 @@ Simple_Demo_Train = [
SimpleNetworkBuilder = [ SimpleNetworkBuilder = [
layerSizes = 2:50*2:2 layerSizes = 2:50*2:2
trainingCriterion = "CrossEntropyWithSoftmax" trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ClassificationError" evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid" layerTypes = "Sigmoid"
initValueScale = 1.0 initValueScale = 1.0
applyMeanVarNorm = true applyMeanVarNorm = true
@ -298,7 +298,7 @@ configparameters: Simple.cntk:Simple_Demo_Train=[
SimpleNetworkBuilder = [ SimpleNetworkBuilder = [
layerSizes = 2:50*2:2 layerSizes = 2:50*2:2
trainingCriterion = "CrossEntropyWithSoftmax" trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ClassificationError" evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid" layerTypes = "Sigmoid"
initValueScale = 1.0 initValueScale = 1.0
applyMeanVarNorm = true applyMeanVarNorm = true
@ -353,7 +353,7 @@ Post-processing network...
7 roots: 7 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax() CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
EvalClassificationError = ClassificationError() EvalErrorPrediction = ErrorPrediction()
InvStdOfFeatures = InvStdDev() InvStdOfFeatures = InvStdDev()
MeanOfFeatures = Mean() MeanOfFeatures = Mean()
PosteriorProb = Softmax() PosteriorProb = Softmax()
@ -382,7 +382,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *] -> [2 x 1 x *]
Validating --> B2 = LearnableParameter() : -> [2 x 1] Validating --> B2 = LearnableParameter() : -> [2 x 1]
Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *], [2 x 1] -> [2 x 1 x *] Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *], [2 x 1] -> [2 x 1 x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *], [2 x 1 x *] -> [1] Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *], [2 x 1 x *] -> [1] Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *] -> [2 x 1 x *] Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *] -> [2 x 1 x *]
Validating --> Prior = Mean (labels) : [2 x *] -> [2] Validating --> Prior = Mean (labels) : [2 x *] -> [2]
Validating --> LogOfPrior = Log (Prior) : [2] -> [2] Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@ -406,14 +406,14 @@ Post-processing network complete.
05/03/2016 13:12:46: Evaluation criterion node(s): 05/03/2016 13:12:46: Evaluation criterion node(s):
05/03/2016 13:12:46: EvalClassificationError = ClassificationError 05/03/2016 13:12:46: EvalErrorPrediction = ErrorPrediction
Allocating matrices for forward and/or backward propagation. Allocating matrices for forward and/or backward propagation.
Memory Sharing Structure: Memory Sharing Structure:
0000000000000000: {[EvalClassificationError Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] } 0000000000000000: {[EvalErrorPrediction Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
000000702B410E90: {[features Value[2 x *]] } 000000702B410E90: {[features Value[2 x *]] }
000000702B44E0C0: {[W0 Value[50 x 2]] } 000000702B44E0C0: {[W0 Value[50 x 2]] }
000000702B4D76F0: {[H2 Value[50 x 1 x *]] [W1*H1 Gradient[50 x 1 x *]] } 000000702B4D76F0: {[H2 Value[50 x 1 x *]] [W1*H1 Gradient[50 x 1 x *]] }
@ -428,7 +428,7 @@ Memory Sharing Structure:
000000702B4D8690: {[B0 Gradient[50 x 1]] [H1 Gradient[50 x 1 x *]] [W1*H1+B1 Gradient[50 x 1 x *]] [W2*H1 Value[2 x 1 x *]] } 000000702B4D8690: {[B0 Gradient[50 x 1]] [H1 Gradient[50 x 1 x *]] [W1*H1+B1 Gradient[50 x 1 x *]] [W2*H1 Value[2 x 1 x *]] }
000000702B4D8730: {[HLast Value[2 x 1 x *]] [W2 Gradient[2 x 50]] } 000000702B4D8730: {[HLast Value[2 x 1 x *]] [W2 Gradient[2 x 50]] }
000000702B4D89B0: {[CrossEntropyWithSoftmax Value[1]] } 000000702B4D89B0: {[CrossEntropyWithSoftmax Value[1]] }
000000702B4D8AF0: {[EvalClassificationError Value[1]] } 000000702B4D8AF0: {[EvalErrorPrediction Value[1]] }
000000702B4D8B90: {[H1 Value[50 x 1 x *]] [W0*features Gradient[50 x *]] } 000000702B4D8B90: {[H1 Value[50 x 1 x *]] [W0*features Gradient[50 x *]] }
000000702B4D8F50: {[B2 Gradient[2 x 1]] } 000000702B4D8F50: {[B2 Gradient[2 x 1]] }
000000702B4D91D0: {[ScaledLogLikelihood Value[2 x 1 x *]] } 000000702B4D91D0: {[ScaledLogLikelihood Value[2 x 1 x *]] }
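Matrices listed under one address in these dumps share a single allocation (e.g. H2 Value with W1*H1 Gradient above), presumably because their lifetimes never overlap; the all-zero address groups buffers that are never materialized at all. A toy sketch of that interval-sharing idea (the lifetimes are invented for illustration):

    def assign_buffers(lifetimes):
        """Greedy slot assignment: buffers with disjoint (first, last) use
        intervals may occupy the same slot. Returns {name: slot_id}."""
        slots, assignment = [], {}   # slots[i] = last use of current tenant
        for name, (start, end) in sorted(lifetimes.items(), key=lambda kv: kv[1][0]):
            for i, free_at in enumerate(slots):
                if free_at < start:          # previous tenant is dead
                    slots[i] = end
                    assignment[name] = i
                    break
            else:
                slots.append(end)
                assignment[name] = len(slots) - 1
        return assignment

    lifetimes = {"H2 Value": (3, 5), "HLast Value": (4, 8), "W1*H1 Gradient": (6, 7)}
    print(assign_buffers(lifetimes))  # H2 Value and W1*H1 Gradient share slot 0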
@ -456,139 +456,139 @@ Memory Sharing Structure:
05/03/2016 13:12:47: Starting Epoch 1: learning rate per sample = 0.020000 effective momentum = 0.900000 momentum as time constant = 237.3 samples 05/03/2016 13:12:47: Starting Epoch 1: learning rate per sample = 0.020000 effective momentum = 0.900000 momentum as time constant = 237.3 samples
05/03/2016 13:12:47: Starting minibatch loop. 05/03/2016 13:12:47: Starting minibatch loop.
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 1- 10]: CrossEntropyWithSoftmax = 0.70511987 * 250; EvalClassificationError = 0.55200000 * 250; time = 0.0327s; samplesPerSecond = 7657.0 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 1- 10]: CrossEntropyWithSoftmax = 0.70511987 * 250; EvalErrorPrediction = 0.55200000 * 250; time = 0.0327s; samplesPerSecond = 7657.0
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 11- 20]: CrossEntropyWithSoftmax = 0.69754895 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0257s; samplesPerSecond = 9726.5 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 11- 20]: CrossEntropyWithSoftmax = 0.69754895 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0257s; samplesPerSecond = 9726.5
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 21- 30]: CrossEntropyWithSoftmax = 0.71056921 * 250; EvalClassificationError = 0.50400000 * 250; time = 0.0248s; samplesPerSecond = 10096.1 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 21- 30]: CrossEntropyWithSoftmax = 0.71056921 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.0248s; samplesPerSecond = 10096.1
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 31- 40]: CrossEntropyWithSoftmax = 0.72951074 * 250; EvalClassificationError = 0.56000000 * 250; time = 0.0245s; samplesPerSecond = 10210.3 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 31- 40]: CrossEntropyWithSoftmax = 0.72951074 * 250; EvalErrorPrediction = 0.56000000 * 250; time = 0.0245s; samplesPerSecond = 10210.3
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 41- 50]: CrossEntropyWithSoftmax = 0.70946655 * 250; EvalClassificationError = 0.48800000 * 250; time = 0.0249s; samplesPerSecond = 10032.5 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 41- 50]: CrossEntropyWithSoftmax = 0.70946655 * 250; EvalErrorPrediction = 0.48800000 * 250; time = 0.0249s; samplesPerSecond = 10032.5
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 51- 60]: CrossEntropyWithSoftmax = 0.72656787 * 250; EvalClassificationError = 0.54400000 * 250; time = 0.0248s; samplesPerSecond = 10065.2 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 51- 60]: CrossEntropyWithSoftmax = 0.72656787 * 250; EvalErrorPrediction = 0.54400000 * 250; time = 0.0248s; samplesPerSecond = 10065.2
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 61- 70]: CrossEntropyWithSoftmax = 0.69337402 * 250; EvalClassificationError = 0.43200000 * 250; time = 0.0256s; samplesPerSecond = 9766.8 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 61- 70]: CrossEntropyWithSoftmax = 0.69337402 * 250; EvalErrorPrediction = 0.43200000 * 250; time = 0.0256s; samplesPerSecond = 9766.8
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 71- 80]: CrossEntropyWithSoftmax = 0.73605176 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0259s; samplesPerSecond = 9662.6 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 71- 80]: CrossEntropyWithSoftmax = 0.73605176 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0259s; samplesPerSecond = 9662.6
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 81- 90]: CrossEntropyWithSoftmax = 0.71453076 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0239s; samplesPerSecond = 10469.0 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 81- 90]: CrossEntropyWithSoftmax = 0.71453076 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0239s; samplesPerSecond = 10469.0
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 91- 100]: CrossEntropyWithSoftmax = 0.75191992 * 250; EvalClassificationError = 0.47200000 * 250; time = 0.0255s; samplesPerSecond = 9802.0 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 91- 100]: CrossEntropyWithSoftmax = 0.75191992 * 250; EvalErrorPrediction = 0.47200000 * 250; time = 0.0255s; samplesPerSecond = 9802.0
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.75975146 * 250; EvalClassificationError = 0.53200000 * 250; time = 0.0248s; samplesPerSecond = 10100.6 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.75975146 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0248s; samplesPerSecond = 10100.6
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.73172168 * 250; EvalClassificationError = 0.50800000 * 250; time = 0.0255s; samplesPerSecond = 9808.5 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.73172168 * 250; EvalErrorPrediction = 0.50800000 * 250; time = 0.0255s; samplesPerSecond = 9808.5
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.76840820 * 250; EvalClassificationError = 0.53200000 * 250; time = 0.0261s; samplesPerSecond = 9593.2 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.76840820 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0261s; samplesPerSecond = 9593.2
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.70464746 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0255s; samplesPerSecond = 9807.4 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.70464746 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0255s; samplesPerSecond = 9807.4
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.70557227 * 250; EvalClassificationError = 0.46400000 * 250; time = 0.0242s; samplesPerSecond = 10340.4 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.70557227 * 250; EvalErrorPrediction = 0.46400000 * 250; time = 0.0242s; samplesPerSecond = 10340.4
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.72711816 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0249s; samplesPerSecond = 10049.8 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.72711816 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0249s; samplesPerSecond = 10049.8
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70076660 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0247s; samplesPerSecond = 10117.4 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70076660 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0247s; samplesPerSecond = 10117.4
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.69409766 * 250; EvalClassificationError = 0.49600000 * 250; time = 0.0254s; samplesPerSecond = 9834.0 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.69409766 * 250; EvalErrorPrediction = 0.49600000 * 250; time = 0.0254s; samplesPerSecond = 9834.0
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.69139941 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0243s; samplesPerSecond = 10275.8 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.69139941 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0243s; samplesPerSecond = 10275.8
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.73361621 * 250; EvalClassificationError = 0.55200000 * 250; time = 0.0255s; samplesPerSecond = 9802.8 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.73361621 * 250; EvalErrorPrediction = 0.55200000 * 250; time = 0.0255s; samplesPerSecond = 9802.8
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.72225879 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0246s; samplesPerSecond = 10146.5 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.72225879 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0246s; samplesPerSecond = 10146.5
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70356348 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0243s; samplesPerSecond = 10286.8 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70356348 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0243s; samplesPerSecond = 10286.8
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69928613 * 250; EvalClassificationError = 0.46400000 * 250; time = 0.0252s; samplesPerSecond = 9909.2 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69928613 * 250; EvalErrorPrediction = 0.46400000 * 250; time = 0.0252s; samplesPerSecond = 9909.2
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.72360938 * 250; EvalClassificationError = 0.51600000 * 250; time = 0.0244s; samplesPerSecond = 10227.0 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.72360938 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0244s; samplesPerSecond = 10227.0
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.69871875 * 250; EvalClassificationError = 0.51200000 * 250; time = 0.0244s; samplesPerSecond = 10243.8 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.69871875 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0244s; samplesPerSecond = 10243.8
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.69114844 * 250; EvalClassificationError = 0.47600000 * 250; time = 0.0248s; samplesPerSecond = 10081.5 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.69114844 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0248s; samplesPerSecond = 10081.5
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.68648047 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0254s; samplesPerSecond = 9844.5 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.68648047 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0254s; samplesPerSecond = 9844.5
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.69657227 * 250; EvalClassificationError = 0.46400000 * 250; time = 0.0258s; samplesPerSecond = 9679.8 05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.69657227 * 250; EvalErrorPrediction = 0.46400000 * 250; time = 0.0258s; samplesPerSecond = 9679.8
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.71585547 * 250; EvalErrorPrediction = 0.45200000 * 250; time = 0.0255s; samplesPerSecond = 9798.2
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.69730664 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0260s; samplesPerSecond = 9609.1
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.70432422 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0265s; samplesPerSecond = 9431.1
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.69991797 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0257s; samplesPerSecond = 9722.7
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.68696875 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0259s; samplesPerSecond = 9647.3
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.67331445 * 250; EvalErrorPrediction = 0.37200000 * 250; time = 0.0267s; samplesPerSecond = 9364.7
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.65711328 * 250; EvalErrorPrediction = 0.43200000 * 250; time = 0.0258s; samplesPerSecond = 9700.1
05/03/2016 13:12:47: Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.64534375 * 250; EvalErrorPrediction = 0.44800000 * 250; time = 0.0260s; samplesPerSecond = 9608.0
05/03/2016 13:12:48: Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.61021875 * 250; EvalErrorPrediction = 0.36400000 * 250; time = 0.0263s; samplesPerSecond = 9515.5
05/03/2016 13:12:48: Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.54191016 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0229s; samplesPerSecond = 10907.5
05/03/2016 13:12:48: Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.45624414 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0239s; samplesPerSecond = 10479.5
05/03/2016 13:12:48: Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.37636133 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0229s; samplesPerSecond = 10917.0
05/03/2016 13:12:48: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.68695688 * 10000; EvalErrorPrediction = 0.45550000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=1.01718s
05/03/2016 13:12:48: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503141245.787579\CNTKTextFormatReader\Examples\Other\Simple2d_Simple@release_cpu/Models/simple.dnn.1'
05/03/2016 13:12:48: Starting Epoch 2: learning rate per sample = 0.008000 effective momentum = 0.900000 momentum as time constant = 237.3 samples
05/03/2016 13:12:48: Starting minibatch loop.
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.28579105 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0228s; samplesPerSecond = 10943.3
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.27768619 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0230s; samplesPerSecond = 10860.1
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.23309790 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0223s; samplesPerSecond = 11187.2
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.20937585 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0221s; samplesPerSecond = 11327.1
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.20192059 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0225s; samplesPerSecond = 11116.5
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.21303992 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0232s; samplesPerSecond = 10762.9
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.17823340 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0247s; samplesPerSecond = 10120.6
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.18892688 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0231s; samplesPerSecond = 10816.4
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.14161328 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0225s; samplesPerSecond = 11100.8
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.15813574 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0226s; samplesPerSecond = 11077.1
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.21082446 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0233s; samplesPerSecond = 10728.2
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.16117041 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0229s; samplesPerSecond = 10928.0
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.15665234 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0223s; samplesPerSecond = 11195.2
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13067773 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0226s; samplesPerSecond = 11047.3
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.16602710 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0212s; samplesPerSecond = 11796.9
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.14975708 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0215s; samplesPerSecond = 11641.4
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.22351709 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0214s; samplesPerSecond = 11708.5
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18010474 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0207s; samplesPerSecond = 12085.5
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.15341577 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0207s; samplesPerSecond = 12072.6
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.17195337 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0209s; samplesPerSecond = 11976.6
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.15546069 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0217s; samplesPerSecond = 11534.6
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.16008325 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0214s; samplesPerSecond = 11689.3
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.15944043 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0209s; samplesPerSecond = 11981.2
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.15336865 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0207s; samplesPerSecond = 12102.4
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.14822266 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0212s; samplesPerSecond = 11766.4
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.14999512 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0211s; samplesPerSecond = 11833.2
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.15481982 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0208s; samplesPerSecond = 11992.7
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.17656738 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0204s; samplesPerSecond = 12229.1
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.22373242 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0213s; samplesPerSecond = 11738.7
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16403760 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0211s; samplesPerSecond = 11856.8
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17322168 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0211s; samplesPerSecond = 11868.0
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.13165430 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0205s; samplesPerSecond = 12202.3
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.14016992 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0208s; samplesPerSecond = 11993.9
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.18369678 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0214s; samplesPerSecond = 11657.7
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.15161035 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0215s; samplesPerSecond = 11612.8
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.18919824 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0215s; samplesPerSecond = 11632.8
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.17373975 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0212s; samplesPerSecond = 11818.1
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.15033740 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0208s; samplesPerSecond = 12036.6
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.12107568 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0207s; samplesPerSecond = 12075.5
05/03/2016 13:12:48: Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15386328 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0227s; samplesPerSecond = 10997.7
05/03/2016 13:12:48: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.17515541 * 10000; EvalErrorPrediction = 0.07440000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=0.87149s
05/03/2016 13:12:48: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503141245.787579\CNTKTextFormatReader\Examples\Other\Simple2d_Simple@release_cpu/Models/simple.dnn.2'
05/03/2016 13:12:48: Starting Epoch 3: learning rate per sample = 0.008000 effective momentum = 0.900000 momentum as time constant = 237.3 samples
05/03/2016 13:12:48: Starting minibatch loop.
05/03/2016 13:12:48: Epoch[ 3 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.10671188 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0217s; samplesPerSecond = 11511.2
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.17609265 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0205s; samplesPerSecond = 12183.8
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.14152701 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0208s; samplesPerSecond = 12001.9
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.16348053 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0213s; samplesPerSecond = 11748.1
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.11764551 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0219s; samplesPerSecond = 11435.4
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.16246954 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0212s; samplesPerSecond = 11811.4
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.16140149 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0207s; samplesPerSecond = 12078.5
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.19747632 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0202s; samplesPerSecond = 12391.0
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.20041309 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0214s; samplesPerSecond = 11659.9
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.13657080 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0208s; samplesPerSecond = 12033.7
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.20124377 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0203s; samplesPerSecond = 12293.5
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.17898120 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0206s; samplesPerSecond = 12144.2
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.16037830 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0232s; samplesPerSecond = 10779.1
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16276050 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0214s; samplesPerSecond = 11704.7
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.19882275 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0218s; samplesPerSecond = 11454.2
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.10263354 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0208s; samplesPerSecond = 12041.2
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17038770 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0213s; samplesPerSecond = 11725.5
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16624731 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0209s; samplesPerSecond = 11958.3
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.12664160 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0213s; samplesPerSecond = 11723.3
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.11944995 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0213s; samplesPerSecond = 11733.8
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.12949756 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0208s; samplesPerSecond = 11996.2
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.18147778 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0222s; samplesPerSecond = 11242.5
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.13172412 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0233s; samplesPerSecond = 10719.0
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.19600269 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0238s; samplesPerSecond = 10521.0
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.15840479 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0226s; samplesPerSecond = 11084.5
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.11888281 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0225s; samplesPerSecond = 11129.9
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.13710742 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0222s; samplesPerSecond = 11251.1
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.20026318 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0233s; samplesPerSecond = 10730.5
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.18824951 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0223s; samplesPerSecond = 11227.9
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16653223 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0225s; samplesPerSecond = 11096.3
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.11935254 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0229s; samplesPerSecond = 10918.5
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.16085400 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0225s; samplesPerSecond = 11132.9
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.16112646 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0219s; samplesPerSecond = 11439.6
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.12345313 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0229s; samplesPerSecond = 10904.6
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13502686 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0226s; samplesPerSecond = 11075.2
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.20874756 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0224s; samplesPerSecond = 11185.2
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16650537 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0227s; samplesPerSecond = 11009.3
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.14995752 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0206s; samplesPerSecond = 12134.7
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.16497070 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0209s; samplesPerSecond = 11953.7
05/03/2016 13:12:49: Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.16843018 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0210s; samplesPerSecond = 11912.1
05/03/2016 13:12:49: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.15794755 * 10000; EvalErrorPrediction = 0.07480000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=0.871499s
05/03/2016 13:12:49: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503141245.787579\CNTKTextFormatReader\Examples\Other\Simple2d_Simple@release_cpu/Models/simple.dnn'
05/03/2016 13:12:49: CNTKCommandTrainEnd: Simple_Demo_Train
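
For readers decoding these baselines: each criterion line reports mean * sampleCount, samplesPerSecond is just samples divided by the block's wall-clock time, and the "momentum as time constant" figure follows from the effective momentum. A minimal Python check, assuming the minibatch size of 25 implied by the 250-sample blocks of 10 minibatches (names here are illustrative):

    import math, struct

    mb_size, momentum = 25, 0.9   # mb_size is an assumption inferred from the log above

    # "momentum as time constant": the number of samples over which one gradient's
    # contribution decays by 1/e, i.e. momentum ** (samples / mb_size) == 1/e.
    tau = -mb_size / math.log(momentum)
    print(round(tau, 1))              # 237.3, as logged

    # Throughput is samples over wall-clock time for the 10-minibatch block:
    print(round(250 / 0.0229, 1))     # 10917.0, cf. the Minibatch[ 391- 400] line

    # Criterion lines are mean * count, so the epoch-1 summed cross entropy is:
    print(0.68695688 * 10000)         # 6869.5688

    # learningRatePerSample is held as float32, which is why 0.008 prints
    # as 0.0080000004 in the epoch summaries:
    print(struct.unpack('f', struct.pack('f', 0.008))[0])   # 0.00800000037997961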
@@ -606,7 +606,7 @@ Post-processing network...
7 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-EvalClassificationError = ClassificationError()
+EvalErrorPrediction = ErrorPrediction()
InvStdOfFeatures = InvStdDev()
MeanOfFeatures = Mean()
PosteriorProb = Softmax()
@@ -635,7 +635,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *1] -> [2 x 1 x *1]
Validating --> B2 = LearnableParameter() : -> [2 x 1]
Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *1], [2 x 1] -> [2 x 1 x *1]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *1] -> [2 x 1 x *1]
Validating --> Prior = Mean (labels) : [2 x *1] -> [2]
Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
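
These Validating lines are shape inference: `*1` marks the variable-length dynamic (batch/sequence) axis, Times contracts the shared 50-dimension, and Plus broadcasts the [2 x 1] bias across the `*` axis. A small numpy sketch of the same propagation (the length 7 chosen for the `*` axis is arbitrary):

    import numpy as np

    n = 7                                  # arbitrary stand-in for the * axis
    W2 = np.zeros((2, 50))                 # [2 x 50]
    H2 = np.zeros((50, 1, n))              # [50 x 1 x *]
    B2 = np.zeros((2, 1))                  # [2 x 1]

    W2_H1 = np.einsum('ij,jkn->ikn', W2, H2)   # Times: contract the 50-axis -> [2 x 1 x *]
    HLast = W2_H1 + B2[:, :, None]             # Plus: broadcast the bias over the * axis
    print(W2_H1.shape, HLast.shape)            # (2, 1, 7) (2, 1, 7)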
@@ -659,7 +659,7 @@ Allocating matrices for forward and/or backward propagation.
Memory Sharing Structure:
-0000000000000000: {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalClassificationError Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
+0000000000000000: {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalErrorPrediction Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
00000070343C5200: {[InvStdOfFeatures Value[2]] }
00000070343C5340: {[Prior Value[2]] }
00000070343C53E0: {[W0 Value[50 x 2]] }
@@ -671,7 +671,7 @@ Memory Sharing Structure:
000000703442D030: {[HLast Value[2 x 1 x *1]] }
000000703442D0D0: {[W0*features Value[50 x *1]] }
000000703442D170: {[W1*H1+B1 Value[50 x 1 x *1]] }
-000000703442D2B0: {[EvalClassificationError Value[1]] }
+000000703442D2B0: {[EvalErrorPrediction Value[1]] }
000000703442D530: {[CrossEntropyWithSoftmax Value[1]] }
000000703442D5D0: {[W2 Value[2 x 50]] }
000000703442D670: {[LogOfPrior Value[2]] }
@@ -684,7 +684,7 @@ Memory Sharing Structure:
0000007034432340: {[B0 Value[50 x 1]] }
0000007034432480: {[B2 Value[2 x 1]] }
-05/03/2016 13:12:50: Final Results: Minibatch[1-1]: EvalClassificationError = 0.05638474 * 603; CrossEntropyWithSoftmax = 0.12474995 * 603; perplexity = 1.13286515
+05/03/2016 13:12:50: Final Results: Minibatch[1-1]: EvalErrorPrediction = 0.05638474 * 603; CrossEntropyWithSoftmax = 0.12474995 * 603; perplexity = 1.13286515
05/03/2016 13:12:50: Action "test" complete.
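
Two consistency checks one can run on that Final Results line: perplexity is the exponential of the mean cross entropy, and the error rate times the sample count recovers a whole number of misclassified samples (illustrative Python):

    import math
    print(math.exp(0.12474995))     # ~1.1328651, the reported perplexity
    print(round(0.05638474 * 603))  # 34, i.e. 34 of the 603 test samples misclassified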
@@ -700,7 +700,7 @@ Post-processing network...
8 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-EvalClassificationError = ClassificationError()
+EvalErrorPrediction = ErrorPrediction()
InvStdOfFeatures = InvStdDev()
MeanOfFeatures = Mean()
PosteriorProb = Softmax()
@@ -730,7 +730,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *2] -> [2 x 1 x *2]
Validating --> B2 = LearnableParameter() : -> [2 x 1]
Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *2], [2 x 1] -> [2 x 1 x *2]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1]
Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *2] -> [2 x 1 x *2]
Validating --> Prior = Mean (labels) : [2 x *2] -> [2]
Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@@ -753,7 +753,7 @@ Allocating matrices for forward and/or backward propagation.
Memory Sharing Structure:
-0000000000000000: {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [CrossEntropyWithSoftmax Value[1]] [EvalClassificationError Gradient[1]] [EvalClassificationError Value[1]] [H1 Gradient[50 x 1 x *2]] [H2 Gradient[50 x 1 x *2]] [HLast Gradient[2 x 1 x *2]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *2]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *2]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *2]] [ScaledLogLikelihood Value[2 x 1 x *2]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *2]] [W0*features+B0 Gradient[50 x 1 x *2]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *2]] [W1*H1+B1 Gradient[50 x 1 x *2]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *2]] [features Gradient[2 x *2]] [labels Gradient[2 x *2]] }
+0000000000000000: {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [CrossEntropyWithSoftmax Value[1]] [EvalErrorPrediction Gradient[1]] [EvalErrorPrediction Value[1]] [H1 Gradient[50 x 1 x *2]] [H2 Gradient[50 x 1 x *2]] [HLast Gradient[2 x 1 x *2]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *2]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *2]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *2]] [ScaledLogLikelihood Value[2 x 1 x *2]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *2]] [W0*features+B0 Gradient[50 x 1 x *2]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *2]] [W1*H1+B1 Gradient[50 x 1 x *2]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *2]] [features Gradient[2 x *2]] [labels Gradient[2 x *2]] }
000000702E3275E0: {[H2 Value[50 x 1 x *2]] }
000000702E327680: {[W2*H1 Value[2 x 1 x *2]] }
000000702E3277C0: {[LogOfPrior Value[2]] }


@@ -56,7 +56,7 @@ Simple_Demo_Train = [
SimpleNetworkBuilder = [
layerSizes = 2:50*2:2
trainingCriterion = "CrossEntropyWithSoftmax"
-evalCriterion = "ClassificationError"
+evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
initValueScale = 1.0
applyMeanVarNorm = true
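
This SimpleNetworkBuilder block is the whole model definition: layerSizes = 2:50*2:2 expands to a 2-dimensional input, two Sigmoid hidden layers of width 50, and a 2-class output, with mean/variance normalization applied to the features first, and with the CrossEntropyWithSoftmax and ErrorPrediction (classification-error) nodes validated earlier as criteria. A rough numpy sketch of the resulting forward pass (the random initialization here is a placeholder, not CNTK's actual scheme):

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    # Shapes match the validation output: W0 [50 x 2], W1 [50 x 50], W2 [2 x 50].
    rng = np.random.default_rng(0)
    W0, B0 = rng.standard_normal((50, 2)), np.zeros((50, 1))
    W1, B1 = rng.standard_normal((50, 50)), np.zeros((50, 1))
    W2, B2 = rng.standard_normal((2, 50)), np.zeros((2, 1))

    def forward(features, mean, inv_std):
        x = (features - mean) * inv_std   # applyMeanVarNorm = true
        h1 = sigmoid(W0 @ x + B0)         # first 50-unit Sigmoid layer
        h2 = sigmoid(W1 @ h1 + B1)        # second 50-unit Sigmoid layer
        return W2 @ h2 + B2               # HLast: pre-softmax class scores

    # trainingCriterion "CrossEntropyWithSoftmax": -log softmax(HLast)[label];
    # evalCriterion "ErrorPrediction": mean of (argmax(HLast, axis=0) != label).
    scores = forward(rng.standard_normal((2, 5)), 0.0, 1.0)
    print(scores.shape)                   # (2, 5): 2 classes x 5 samples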
@@ -155,7 +155,7 @@ Simple_Demo_Train = [
SimpleNetworkBuilder = [
layerSizes = 2:50*2:2
trainingCriterion = "CrossEntropyWithSoftmax"
-evalCriterion = "ClassificationError"
+evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
initValueScale = 1.0
applyMeanVarNorm = true
@@ -298,7 +298,7 @@ configparameters: Simple.cntk:Simple_Demo_Train=[
SimpleNetworkBuilder = [
layerSizes = 2:50*2:2
trainingCriterion = "CrossEntropyWithSoftmax"
-evalCriterion = "ClassificationError"
+evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
initValueScale = 1.0
applyMeanVarNorm = true
@@ -354,7 +354,7 @@ Post-processing network...
7 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-EvalClassificationError = ClassificationError()
+EvalErrorPrediction = ErrorPrediction()
InvStdOfFeatures = InvStdDev()
MeanOfFeatures = Mean()
PosteriorProb = Softmax()
@@ -383,7 +383,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *] -> [2 x 1 x *]
Validating --> B2 = LearnableParameter() : -> [2 x 1]
Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *], [2 x 1] -> [2 x 1 x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *], [2 x 1 x *] -> [1]
Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *] -> [2 x 1 x *]
Validating --> Prior = Mean (labels) : [2 x *] -> [2]
Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@@ -407,14 +407,14 @@ Post-processing network complete.
05/03/2016 13:01:59: Evaluation criterion node(s):
-05/03/2016 13:01:59: EvalClassificationError = ClassificationError
+05/03/2016 13:01:59: EvalErrorPrediction = ErrorPrediction
Allocating matrices for forward and/or backward propagation.
Memory Sharing Structure:
-0000000000000000: {[EvalClassificationError Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
+0000000000000000: {[EvalErrorPrediction Gradient[1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *]] [PosteriorProb Value[2 x 1 x *]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *]] [features Gradient[2 x *]] [labels Gradient[2 x *]] }
000000501A590FF0: {[W2 Value[2 x 50]] }
000000501A591090: {[W0 Value[50 x 2]] }
000000501A5919F0: {[B1 Value[50 x 1]] }
@@ -427,7 +427,7 @@ Memory Sharing Structure:
000000501A5A1180: {[ScaledLogLikelihood Value[2 x 1 x *]] }
000000501A5A1220: {[B0 Gradient[50 x 1]] [H1 Gradient[50 x 1 x *]] [W1*H1+B1 Gradient[50 x 1 x *]] [W2*H1 Value[2 x 1 x *]] }
000000501A5A17C0: {[W0 Gradient[50 x 2]] [W0*features+B0 Value[50 x 1 x *]] }
-000000501A5A1900: {[EvalClassificationError Value[1]] }
+000000501A5A1900: {[EvalErrorPrediction Value[1]] }
000000501A5A19A0: {[W0*features Value[50 x *]] }
000000501A5A1A40: {[W2*H1 Gradient[2 x 1 x *]] }
000000501A5A1F40: {[MVNormalizedFeatures Value[2 x *]] }
@@ -457,139 +457,139 @@ Memory Sharing Structure:
05/03/2016 13:01:59: Starting Epoch 1: learning rate per sample = 0.020000 effective momentum = 0.900000 momentum as time constant = 237.3 samples
05/03/2016 13:01:59: Starting minibatch loop.
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 1- 10]: CrossEntropyWithSoftmax = 0.70650452 * 250; EvalErrorPrediction = 0.55200000 * 250; time = 0.0123s; samplesPerSecond = 20247.8
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 11- 20]: CrossEntropyWithSoftmax = 0.69701831 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0112s; samplesPerSecond = 22393.4
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 21- 30]: CrossEntropyWithSoftmax = 0.71089587 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.0126s; samplesPerSecond = 19907.6
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 31- 40]: CrossEntropyWithSoftmax = 0.72980273 * 250; EvalErrorPrediction = 0.56000000 * 250; time = 0.0113s; samplesPerSecond = 22042.0
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 41- 50]: CrossEntropyWithSoftmax = 0.70902783 * 250; EvalErrorPrediction = 0.52800000 * 250; time = 0.0131s; samplesPerSecond = 19088.3
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 51- 60]: CrossEntropyWithSoftmax = 0.72657300 * 250; EvalErrorPrediction = 0.54400000 * 250; time = 0.0138s; samplesPerSecond = 18059.7
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 61- 70]: CrossEntropyWithSoftmax = 0.69319678 * 250; EvalErrorPrediction = 0.43200000 * 250; time = 0.0148s; samplesPerSecond = 16917.0
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 71- 80]: CrossEntropyWithSoftmax = 0.73563477 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0164s; samplesPerSecond = 15236.5
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 81- 90]: CrossEntropyWithSoftmax = 0.71463281 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0123s; samplesPerSecond = 20321.9
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 91- 100]: CrossEntropyWithSoftmax = 0.75213428 * 250; EvalErrorPrediction = 0.47200000 * 250; time = 0.0167s; samplesPerSecond = 14944.1
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 101- 110]: CrossEntropyWithSoftmax = 0.75931445 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0131s; samplesPerSecond = 19105.8
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 111- 120]: CrossEntropyWithSoftmax = 0.73075293 * 250; EvalErrorPrediction = 0.50800000 * 250; time = 0.0132s; samplesPerSecond = 18886.5
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 121- 130]: CrossEntropyWithSoftmax = 0.76701953 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0128s; samplesPerSecond = 19574.1
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 131- 140]: CrossEntropyWithSoftmax = 0.70451270 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0128s; samplesPerSecond = 19467.4
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 141- 150]: CrossEntropyWithSoftmax = 0.70539941 * 250; EvalErrorPrediction = 0.50400000 * 250; time = 0.0143s; samplesPerSecond = 17444.7
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 151- 160]: CrossEntropyWithSoftmax = 0.72700293 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0123s; samplesPerSecond = 20391.5
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 161- 170]: CrossEntropyWithSoftmax = 0.70096191 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0143s; samplesPerSecond = 17465.4
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 171- 180]: CrossEntropyWithSoftmax = 0.69437305 * 250; EvalErrorPrediction = 0.49600000 * 250; time = 0.0117s; samplesPerSecond = 21367.5
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 181- 190]: CrossEntropyWithSoftmax = 0.69161621 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0137s; samplesPerSecond = 18200.3
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 191- 200]: CrossEntropyWithSoftmax = 0.73388281 * 250; EvalErrorPrediction = 0.55200000 * 250; time = 0.0115s; samplesPerSecond = 21782.7
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 201- 210]: CrossEntropyWithSoftmax = 0.72255664 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0127s; samplesPerSecond = 19745.7
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 211- 220]: CrossEntropyWithSoftmax = 0.70414551 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0131s; samplesPerSecond = 19017.2
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 221- 230]: CrossEntropyWithSoftmax = 0.69976758 * 250; EvalErrorPrediction = 0.46000000 * 250; time = 0.0137s; samplesPerSecond = 18191.1
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 231- 240]: CrossEntropyWithSoftmax = 0.72419141 * 250; EvalErrorPrediction = 0.51600000 * 250; time = 0.0143s; samplesPerSecond = 17444.7
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 241- 250]: CrossEntropyWithSoftmax = 0.69943945 * 250; EvalErrorPrediction = 0.51200000 * 250; time = 0.0109s; samplesPerSecond = 22891.7
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 251- 260]: CrossEntropyWithSoftmax = 0.69206445 * 250; EvalErrorPrediction = 0.47600000 * 250; time = 0.0133s; samplesPerSecond = 18739.2
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 261- 270]: CrossEntropyWithSoftmax = 0.68771680 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0130s; samplesPerSecond = 19291.6
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 271- 280]: CrossEntropyWithSoftmax = 0.69878516 * 250; EvalErrorPrediction = 0.44000000 * 250; time = 0.0130s; samplesPerSecond = 19230.8
05/03/2016 13:01:59: Epoch[ 1 of 3]-Minibatch[ 281- 290]: CrossEntropyWithSoftmax = 0.71889844 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0118s; samplesPerSecond = 21168.5
05/03/2016 13:02:00: Epoch[ 1 of 3]-Minibatch[ 291- 300]: CrossEntropyWithSoftmax = 0.70086523 * 250; EvalErrorPrediction = 0.52400000 * 250; time = 0.0128s; samplesPerSecond = 19577.1
05/03/2016 13:02:00: Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.70878320 * 250; EvalClassificationError = 0.53200000 * 250; time = 0.0129s; samplesPerSecond = 19432.6 05/03/2016 13:02:00: Epoch[ 1 of 3]-Minibatch[ 301- 310]: CrossEntropyWithSoftmax = 0.70878320 * 250; EvalErrorPrediction = 0.53200000 * 250; time = 0.0129s; samplesPerSecond = 19432.6
05/03/2016 13:02:00: Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.70674414 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0126s; samplesPerSecond = 19767.5 05/03/2016 13:02:00: Epoch[ 1 of 3]-Minibatch[ 311- 320]: CrossEntropyWithSoftmax = 0.70674414 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0126s; samplesPerSecond = 19767.5
05/03/2016 13:02:00: Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.69707422 * 250; EvalClassificationError = 0.50000000 * 250; time = 0.0121s; samplesPerSecond = 20736.6 05/03/2016 13:02:00: Epoch[ 1 of 3]-Minibatch[ 321- 330]: CrossEntropyWithSoftmax = 0.69707422 * 250; EvalErrorPrediction = 0.50000000 * 250; time = 0.0121s; samplesPerSecond = 20736.6
05/03/2016 13:02:00: Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.68588281 * 250; EvalClassificationError = 0.40800000 * 250; time = 0.0124s; samplesPerSecond = 20109.4 05/03/2016 13:02:00: Epoch[ 1 of 3]-Minibatch[ 331- 340]: CrossEntropyWithSoftmax = 0.68588281 * 250; EvalErrorPrediction = 0.40800000 * 250; time = 0.0124s; samplesPerSecond = 20109.4
05/03/2016 13:02:00: Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.67734766 * 250; EvalClassificationError = 0.45600000 * 250; time = 0.0127s; samplesPerSecond = 19727.0 05/03/2016 13:02:00: Epoch[ 1 of 3]-Minibatch[ 341- 350]: CrossEntropyWithSoftmax = 0.67734766 * 250; EvalErrorPrediction = 0.45600000 * 250; time = 0.0127s; samplesPerSecond = 19727.0
05/03/2016 13:02:00: Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.67958008 * 250; EvalClassificationError = 0.48000000 * 250; time = 0.0127s; samplesPerSecond = 19615.5 05/03/2016 13:02:00: Epoch[ 1 of 3]-Minibatch[ 351- 360]: CrossEntropyWithSoftmax = 0.67958008 * 250; EvalErrorPrediction = 0.48000000 * 250; time = 0.0127s; samplesPerSecond = 19615.5
05/03/2016 13:02:00: Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.66424805 * 250; EvalClassificationError = 0.46800000 * 250; time = 0.0117s; samplesPerSecond = 21292.9 05/03/2016 13:02:00: Epoch[ 1 of 3]-Minibatch[ 361- 370]: CrossEntropyWithSoftmax = 0.66424805 * 250; EvalErrorPrediction = 0.46800000 * 250; time = 0.0117s; samplesPerSecond = 21292.9
05/03/2016 13:02:00: Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.62412500 * 250; EvalClassificationError = 0.20400000 * 250; time = 0.0127s; samplesPerSecond = 19624.8 05/03/2016 13:02:00: Epoch[ 1 of 3]-Minibatch[ 371- 380]: CrossEntropyWithSoftmax = 0.62412500 * 250; EvalErrorPrediction = 0.20400000 * 250; time = 0.0127s; samplesPerSecond = 19624.8
05/03/2016 13:02:00: Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.58007422 * 250; EvalClassificationError = 0.16000000 * 250; time = 0.0130s; samplesPerSecond = 19157.1 05/03/2016 13:02:00: Epoch[ 1 of 3]-Minibatch[ 381- 390]: CrossEntropyWithSoftmax = 0.58007422 * 250; EvalErrorPrediction = 0.16000000 * 250; time = 0.0130s; samplesPerSecond = 19157.1
05/03/2016 13:02:00: Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.52764648 * 250; EvalClassificationError = 0.19200000 * 250; time = 0.0143s; samplesPerSecond = 17521.7 05/03/2016 13:02:00: Epoch[ 1 of 3]-Minibatch[ 391- 400]: CrossEntropyWithSoftmax = 0.52764648 * 250; EvalErrorPrediction = 0.19200000 * 250; time = 0.0143s; samplesPerSecond = 17521.7
05/03/2016 13:02:00: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.69975483 * 10000; EvalClassificationError = 0.46850000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=0.526194s 05/03/2016 13:02:00: Finished Epoch[ 1 of 3]: [Training] CrossEntropyWithSoftmax = 0.69975483 * 10000; EvalErrorPrediction = 0.46850000 * 10000; totalSamplesSeen = 10000; learningRatePerSample = 0.02; epochTime=0.526194s
05/03/2016 13:02:00: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503140157.802427\CNTKTextFormatReader\Examples\Other\Simple2d_Simple@release_gpu/Models/simple.dnn.1' 05/03/2016 13:02:00: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503140157.802427\CNTKTextFormatReader\Examples\Other\Simple2d_Simple@release_gpu/Models/simple.dnn.1'
05/03/2016 13:02:00: Starting Epoch 2: learning rate per sample = 0.008000 effective momentum = 0.900000 momentum as time constant = 237.3 samples 05/03/2016 13:02:00: Starting Epoch 2: learning rate per sample = 0.008000 effective momentum = 0.900000 momentum as time constant = 237.3 samples
05/03/2016 13:02:00: Starting minibatch loop. 05/03/2016 13:02:00: Starting minibatch loop.
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.44832977 * 250; EvalErrorPrediction = 0.15200000 * 250; time = 0.0124s; samplesPerSecond = 20205.3
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.40085291 * 250; EvalErrorPrediction = 0.12400000 * 250; time = 0.0142s; samplesPerSecond = 17631.7
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.33487201 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0129s; samplesPerSecond = 19405.4
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.29081885 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0125s; samplesPerSecond = 20016.0
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.26279236 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0118s; samplesPerSecond = 21188.2
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.25220630 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0138s; samplesPerSecond = 18158.0
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.20988293 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0129s; samplesPerSecond = 19447.7
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.21577441 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0148s; samplesPerSecond = 16846.4
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.16622900 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0157s; samplesPerSecond = 15967.3
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.17637866 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0144s; samplesPerSecond = 17315.4
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.22185278 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0123s; samplesPerSecond = 20366.6
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.17055811 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0151s; samplesPerSecond = 16564.0
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.16481055 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0140s; samplesPerSecond = 17910.9
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.13871704 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0156s; samplesPerSecond = 16005.1
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.16922363 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0143s; samplesPerSecond = 17454.4
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.15403345 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0135s; samplesPerSecond = 18485.7
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.22255859 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0108s; samplesPerSecond = 23079.8
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.18146851 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0133s; samplesPerSecond = 18843.7
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.15611523 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0131s; samplesPerSecond = 19081.1
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.17320215 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0137s; samplesPerSecond = 18192.4
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.15727930 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0117s; samplesPerSecond = 21404.1
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.16195410 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0119s; samplesPerSecond = 21088.1
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.16121338 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0128s; samplesPerSecond = 19546.5
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.15427100 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0125s; samplesPerSecond = 20011.2
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.14844775 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0141s; samplesPerSecond = 17743.1
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.15055713 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0108s; samplesPerSecond = 23067.0
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.15467627 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0132s; samplesPerSecond = 18965.3
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.17615869 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0140s; samplesPerSecond = 17872.5
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.22356104 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0121s; samplesPerSecond = 20650.9
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16514209 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0109s; samplesPerSecond = 22946.3
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.17355859 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0129s; samplesPerSecond = 19372.3
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.13117578 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0138s; samplesPerSecond = 18151.5
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.13956104 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0121s; samplesPerSecond = 20743.4
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.18397363 * 250; EvalErrorPrediction = 0.09600000 * 250; time = 0.0105s; samplesPerSecond = 23741.7
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.15222656 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0126s; samplesPerSecond = 19909.2
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.18856396 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0145s; samplesPerSecond = 17207.0
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.17513330 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0130s; samplesPerSecond = 19199.8
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.15008252 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0108s; samplesPerSecond = 23043.6
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.12125342 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0127s; samplesPerSecond = 19668.0
05/03/2016 13:02:00: Epoch[ 2 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.15408496 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0141s; samplesPerSecond = 17788.5
05/03/2016 13:02:00: Finished Epoch[ 2 of 3]: [Training] CrossEntropyWithSoftmax = 0.19333879 * 10000; EvalErrorPrediction = 0.07700000 * 10000; totalSamplesSeen = 20000; learningRatePerSample = 0.0080000004; epochTime=0.525411s
05/03/2016 13:02:00: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503140157.802427\CNTKTextFormatReader\Examples\Other\Simple2d_Simple@release_gpu/Models/simple.dnn.2'
05/03/2016 13:02:00: Starting Epoch 3: learning rate per sample = 0.008000 effective momentum = 0.900000 momentum as time constant = 237.3 samples
05/03/2016 13:02:00: Starting minibatch loop.
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 1- 10, 2.50%]: CrossEntropyWithSoftmax = 0.10746781 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0126s; samplesPerSecond = 19806.7
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 11- 20, 5.00%]: CrossEntropyWithSoftmax = 0.17648278 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0122s; samplesPerSecond = 20429.8
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 21- 30, 7.50%]: CrossEntropyWithSoftmax = 0.14106094 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0126s; samplesPerSecond = 19838.1
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 31- 40, 10.00%]: CrossEntropyWithSoftmax = 0.16348077 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0127s; samplesPerSecond = 19745.7
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 41- 50, 12.50%]: CrossEntropyWithSoftmax = 0.11767151 * 250; EvalErrorPrediction = 0.04000000 * 250; time = 0.0110s; samplesPerSecond = 22787.3
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 51- 60, 15.00%]: CrossEntropyWithSoftmax = 0.16217944 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0137s; samplesPerSecond = 18292.2
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 61- 70, 17.50%]: CrossEntropyWithSoftmax = 0.16171204 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0147s; samplesPerSecond = 16977.9
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 71- 80, 20.00%]: CrossEntropyWithSoftmax = 0.19844067 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0130s; samplesPerSecond = 19285.7
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 81- 90, 22.50%]: CrossEntropyWithSoftmax = 0.19984509 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0116s; samplesPerSecond = 21585.2
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 91- 100, 25.00%]: CrossEntropyWithSoftmax = 0.13727051 * 250; EvalErrorPrediction = 0.05200000 * 250; time = 0.0133s; samplesPerSecond = 18839.5
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 101- 110, 27.50%]: CrossEntropyWithSoftmax = 0.20126648 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0150s; samplesPerSecond = 16709.0
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 111- 120, 30.00%]: CrossEntropyWithSoftmax = 0.17913672 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0138s; samplesPerSecond = 18066.2
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 121- 130, 32.50%]: CrossEntropyWithSoftmax = 0.15983582 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0138s; samplesPerSecond = 18131.7
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 131- 140, 35.00%]: CrossEntropyWithSoftmax = 0.16260010 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0126s; samplesPerSecond = 19798.8
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 141- 150, 37.50%]: CrossEntropyWithSoftmax = 0.19813428 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0122s; samplesPerSecond = 20453.2
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 151- 160, 40.00%]: CrossEntropyWithSoftmax = 0.10295117 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0124s; samplesPerSecond = 20091.6
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 161- 170, 42.50%]: CrossEntropyWithSoftmax = 0.17117065 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0127s; samplesPerSecond = 19762.8
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 171- 180, 45.00%]: CrossEntropyWithSoftmax = 0.16661938 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0127s; samplesPerSecond = 19620.2
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 181- 190, 47.50%]: CrossEntropyWithSoftmax = 0.12718042 * 250; EvalErrorPrediction = 0.05600000 * 250; time = 0.0108s; samplesPerSecond = 23156.7
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 191- 200, 50.00%]: CrossEntropyWithSoftmax = 0.11923853 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0139s; samplesPerSecond = 17989.5
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 201- 210, 52.50%]: CrossEntropyWithSoftmax = 0.12890332 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0129s; samplesPerSecond = 19340.9
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 211- 220, 55.00%]: CrossEntropyWithSoftmax = 0.18205469 * 250; EvalErrorPrediction = 0.10000000 * 250; time = 0.0124s; samplesPerSecond = 20182.4
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 221- 230, 57.50%]: CrossEntropyWithSoftmax = 0.13154199 * 250; EvalErrorPrediction = 0.06000000 * 250; time = 0.0111s; samplesPerSecond = 22599.9
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 231- 240, 60.00%]: CrossEntropyWithSoftmax = 0.19668359 * 250; EvalErrorPrediction = 0.10400000 * 250; time = 0.0139s; samplesPerSecond = 17922.4
05/03/2016 13:02:00: Epoch[ 3 of 3]-Minibatch[ 241- 250, 62.50%]: CrossEntropyWithSoftmax = 0.15817578 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0126s; samplesPerSecond = 19915.6
05/03/2016 13:02:01: Epoch[ 3 of 3]-Minibatch[ 251- 260, 65.00%]: CrossEntropyWithSoftmax = 0.11871240 * 250; EvalErrorPrediction = 0.04400000 * 250; time = 0.0136s; samplesPerSecond = 18378.3
05/03/2016 13:02:01: Epoch[ 3 of 3]-Minibatch[ 261- 270, 67.50%]: CrossEntropyWithSoftmax = 0.13730908 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0107s; samplesPerSecond = 23384.2
05/03/2016 13:02:01: Epoch[ 3 of 3]-Minibatch[ 271- 280, 70.00%]: CrossEntropyWithSoftmax = 0.20024854 * 250; EvalErrorPrediction = 0.09200000 * 250; time = 0.0134s; samplesPerSecond = 18719.6
05/03/2016 13:02:01: Epoch[ 3 of 3]-Minibatch[ 281- 290, 72.50%]: CrossEntropyWithSoftmax = 0.18850244 * 250; EvalErrorPrediction = 0.10800000 * 250; time = 0.0131s; samplesPerSecond = 19151.2
05/03/2016 13:02:01: Epoch[ 3 of 3]-Minibatch[ 291- 300, 75.00%]: CrossEntropyWithSoftmax = 0.16640479 * 250; EvalErrorPrediction = 0.07200000 * 250; time = 0.0108s; samplesPerSecond = 23086.2
05/03/2016 13:02:01: Epoch[ 3 of 3]-Minibatch[ 301- 310, 77.50%]: CrossEntropyWithSoftmax = 0.11872168 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0107s; samplesPerSecond = 23347.0
05/03/2016 13:02:01: Epoch[ 3 of 3]-Minibatch[ 311- 320, 80.00%]: CrossEntropyWithSoftmax = 0.16090430 * 250; EvalErrorPrediction = 0.08800000 * 250; time = 0.0127s; samplesPerSecond = 19730.1
05/03/2016 13:02:01: Epoch[ 3 of 3]-Minibatch[ 321- 330, 82.50%]: CrossEntropyWithSoftmax = 0.16162939 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0137s; samplesPerSecond = 18205.7
05/03/2016 13:02:01: Epoch[ 3 of 3]-Minibatch[ 331- 340, 85.00%]: CrossEntropyWithSoftmax = 0.12408594 * 250; EvalErrorPrediction = 0.04800000 * 250; time = 0.0109s; samplesPerSecond = 22839.4
05/03/2016 13:02:01: Epoch[ 3 of 3]-Minibatch[ 341- 350, 87.50%]: CrossEntropyWithSoftmax = 0.13544434 * 250; EvalErrorPrediction = 0.06800000 * 250; time = 0.0126s; samplesPerSecond = 19893.4
05/03/2016 13:02:01: Epoch[ 3 of 3]-Minibatch[ 351- 360, 90.00%]: CrossEntropyWithSoftmax = 0.20890771 * 250; EvalErrorPrediction = 0.11200000 * 250; time = 0.0129s; samplesPerSecond = 19366.3
05/03/2016 13:02:01: Epoch[ 3 of 3]-Minibatch[ 361- 370, 92.50%]: CrossEntropyWithSoftmax = 0.16674365 * 250; EvalErrorPrediction = 0.08400000 * 250; time = 0.0146s; samplesPerSecond = 17116.3
05/03/2016 13:02:01: Epoch[ 3 of 3]-Minibatch[ 371- 380, 95.00%]: CrossEntropyWithSoftmax = 0.15033398 * 250; EvalErrorPrediction = 0.06400000 * 250; time = 0.0131s; samplesPerSecond = 19152.7
05/03/2016 13:02:01: Epoch[ 3 of 3]-Minibatch[ 381- 390, 97.50%]: CrossEntropyWithSoftmax = 0.16547705 * 250; EvalErrorPrediction = 0.07600000 * 250; time = 0.0120s; samplesPerSecond = 20752.1
05/03/2016 13:02:01: Epoch[ 3 of 3]-Minibatch[ 391- 400, 100.00%]: CrossEntropyWithSoftmax = 0.16792480 * 250; EvalErrorPrediction = 0.08000000 * 250; time = 0.0129s; samplesPerSecond = 19450.7
05/03/2016 13:02:01: Finished Epoch[ 3 of 3]: [Training] CrossEntropyWithSoftmax = 0.15806136 * 10000; EvalErrorPrediction = 0.07470000 * 10000; totalSamplesSeen = 30000; learningRatePerSample = 0.0080000004; epochTime=0.511151s
05/03/2016 13:02:01: SGD: Saving checkpoint model 'E:\cygwin64\tmp\cntk-test-20160503140157.802427\CNTKTextFormatReader\Examples\Other\Simple2d_Simple@release_gpu/Models/simple.dnn'
05/03/2016 13:02:01: CNTKCommandTrainEnd: Simple_Demo_Train
@@ -607,7 +607,7 @@ Post-processing network...
7 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
EvalErrorPrediction = ErrorPrediction()
InvStdOfFeatures = InvStdDev()
MeanOfFeatures = Mean()
PosteriorProb = Softmax()
@@ -636,7 +636,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *1] -> [2 x 1 x *1]
Validating --> B2 = LearnableParameter() : -> [2 x 1]
Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *1], [2 x 1] -> [2 x 1 x *1]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *1], [2 x 1 x *1] -> [1]
Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *1] -> [2 x 1 x *1]
Validating --> Prior = Mean (labels) : [2 x *1] -> [2]
Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@@ -660,7 +660,7 @@ Allocating matrices for forward and/or backward propagation.
Memory Sharing Structure:
0000000000000000: {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [EvalErrorPrediction Gradient[1]] [H1 Gradient[50 x 1 x *1]] [H2 Gradient[50 x 1 x *1]] [HLast Gradient[2 x 1 x *1]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *1]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *1]] [PosteriorProb Value[2 x 1 x *1]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *1]] [ScaledLogLikelihood Value[2 x 1 x *1]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *1]] [W0*features+B0 Gradient[50 x 1 x *1]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *1]] [W1*H1+B1 Gradient[50 x 1 x *1]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *1]] [features Gradient[2 x *1]] [labels Gradient[2 x *1]] }
000000501A591090: {[W0*features+B0 Value[50 x 1 x *1]] }
000000501A591130: {[W1*H1 Value[50 x 1 x *1]] }
000000501A5916D0: {[W1*H1+B1 Value[50 x 1 x *1]] }
@@ -672,7 +672,7 @@ Memory Sharing Structure:
000000501A592850: {[LogOfPrior Value[2]] }
000000501A5928F0: {[H2 Value[50 x 1 x *1]] }
000000501A592B70: {[W2 Value[2 x 50]] }
000000501A592D50: {[EvalErrorPrediction Value[1]] }
000000501A592DF0: {[CrossEntropyWithSoftmax Value[1]] }
0000005024E60C70: {[W1 Value[50 x 50]] }
0000005024E613F0: {[W0 Value[50 x 2]] }
@@ -685,7 +685,7 @@ Memory Sharing Structure:
0000005024E62430: {[features Value[2 x *1]] }
0000005024E624D0: {[B1 Value[50 x 1]] }
05/03/2016 13:02:01: Final Results: Minibatch[1-1]: EvalErrorPrediction = 0.05638474 * 603; CrossEntropyWithSoftmax = 0.12740351 * 603; perplexity = 1.13587526
05/03/2016 13:02:01: Action "test" complete.
@@ -701,7 +701,7 @@ Post-processing network...
8 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
EvalErrorPrediction = ErrorPrediction()
InvStdOfFeatures = InvStdDev()
MeanOfFeatures = Mean()
PosteriorProb = Softmax()
@@ -731,7 +731,7 @@ Validating --> W2*H1 = Times (W2, H2) : [2 x 50], [50 x 1 x *2] -> [2 x 1 x *2]
Validating --> B2 = LearnableParameter() : -> [2 x 1]
Validating --> HLast = Plus (W2*H1, B2) : [2 x 1 x *2], [2 x 1] -> [2 x 1 x *2]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1]
Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [2 x *2], [2 x 1 x *2] -> [1]
Validating --> PosteriorProb = Softmax (HLast) : [2 x 1 x *2] -> [2 x 1 x *2]
Validating --> Prior = Mean (labels) : [2 x *2] -> [2]
Validating --> LogOfPrior = Log (Prior) : [2] -> [2]
@@ -754,7 +754,7 @@ Allocating matrices for forward and/or backward propagation.
Memory Sharing Structure:
0000000000000000: {[B0 Gradient[50 x 1]] [B1 Gradient[50 x 1]] [B2 Gradient[2 x 1]] [CrossEntropyWithSoftmax Gradient[1]] [CrossEntropyWithSoftmax Value[1]] [EvalErrorPrediction Gradient[1]] [EvalErrorPrediction Value[1]] [H1 Gradient[50 x 1 x *2]] [H2 Gradient[50 x 1 x *2]] [HLast Gradient[2 x 1 x *2]] [InvStdOfFeatures Gradient[2]] [LogOfPrior Gradient[2]] [MVNormalizedFeatures Gradient[2 x *2]] [MeanOfFeatures Gradient[2]] [PosteriorProb Gradient[2 x 1 x *2]] [Prior Gradient[2]] [ScaledLogLikelihood Gradient[2 x 1 x *2]] [ScaledLogLikelihood Value[2 x 1 x *2]] [W0 Gradient[50 x 2]] [W0*features Gradient[50 x *2]] [W0*features+B0 Gradient[50 x 1 x *2]] [W1 Gradient[50 x 50]] [W1*H1 Gradient[50 x 1 x *2]] [W1*H1+B1 Gradient[50 x 1 x *2]] [W2 Gradient[2 x 50]] [W2*H1 Gradient[2 x 1 x *2]] [features Gradient[2 x *2]] [labels Gradient[2 x *2]] }
000000501A5914F0: {[InvStdOfFeatures Value[2]] }
000000501A591590: {[MeanOfFeatures Value[2]] }
000000501A5916D0: {[labels Value[2 x *2]] }

View file

@@ -0,0 +1,434 @@
CPU info:
CPU Model Name: Intel(R) Xeon(R) CPU E5-2630 v2 @ 2.60GHz
Hardware threads: 24
Total Memory: 264172964 kB
-------------------------------------------------------------------
=== Running /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/1bitsgd/release/bin/cntk configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config/FeedForward.cntk currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu DeviceId=-1 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=2048]] speechTrain=[reader=[useMersenneTwisterRand=true]]
-------------------------------------------------------------------
Build info:
Built time: Aug 16 2016 09:41:57
Last modified date: Mon Aug 15 23:39:17 2016
Build type: release
Build target: GPU
With 1bit-SGD: yes
Math lib: mkl
CUDA_PATH: /usr/local/cuda-7.5
CUB_PATH: /usr/local/cub-1.4.1
CUDNN_PATH: /usr/local/cudnn-4.0
Build Branch: HEAD
Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
Built by philly on 643085f7f8c2
Build Path: /home/philly/jenkins/workspace/CNTK-Build-Linux
-------------------------------------------------------------------
Changed current directory to /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
MPIWrapper: initializing MPI
ping [requestnodes (before change)]: 1 nodes pinging each other
ping [requestnodes (before change)]: all 1 nodes responded
requestnodes [MPIWrapper]: using 1 out of 1 MPI nodes (1 requested); we (0) are in (participating)
ping [requestnodes (after change)]: 1 nodes pinging each other
ping [requestnodes (after change)]: all 1 nodes responded
mpihelper: only one MPI process: MPI operation will be boring
ping [mpihelper]: 1 nodes pinging each other
ping [mpihelper]: all 1 nodes responded
08/16/2016 10:01:41: -------------------------------------------------------------------
08/16/2016 10:01:41: Build info:
08/16/2016 10:01:41: Built time: Aug 16 2016 09:41:57
08/16/2016 10:01:41: Last modified date: Mon Aug 15 23:39:17 2016
08/16/2016 10:01:41: Build type: release
08/16/2016 10:01:41: Build target: GPU
08/16/2016 10:01:41: With 1bit-SGD: yes
08/16/2016 10:01:41: Math lib: mkl
08/16/2016 10:01:41: CUDA_PATH: /usr/local/cuda-7.5
08/16/2016 10:01:41: CUB_PATH: /usr/local/cub-1.4.1
08/16/2016 10:01:41: CUDNN_PATH: /usr/local/cudnn-4.0
08/16/2016 10:01:41: Build Branch: HEAD
08/16/2016 10:01:41: Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
08/16/2016 10:01:41: Built by philly on 643085f7f8c2
08/16/2016 10:01:41: Build Path: /home/philly/jenkins/workspace/CNTK-Build-Linux
08/16/2016 10:01:41: -------------------------------------------------------------------
08/16/2016 10:01:42: -------------------------------------------------------------------
08/16/2016 10:01:42: GPU info:
08/16/2016 10:01:42: Device[0]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
08/16/2016 10:01:42: Device[1]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
08/16/2016 10:01:42: Device[2]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
08/16/2016 10:01:42: Device[3]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
08/16/2016 10:01:42: -------------------------------------------------------------------
08/16/2016 10:01:42: Running on localhost at 2016/08/16 10:01:42
08/16/2016 10:01:42: Command line:
/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/1bitsgd/release/bin/cntk configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config/FeedForward.cntk currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu DeviceId=-1 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=2048]] speechTrain=[reader=[useMersenneTwisterRand=true]]
08/16/2016 10:01:42: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
08/16/2016 10:01:42: RootDir = ".."
ConfigDir = "$RootDir$/Config"
DataDir = "$RootDir$/Data"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"
deviceId = -1
command = speechTrain
precision = "float"
traceLevel = "1"
modelPath = "$ModelDir$/cntkSpeechFF.dnn"
parallelTrain = true
speechTrain = [
action = "train"
SimpleNetworkBuilder = [
layerSizes = 363:512:512:132
trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
applyMeanVarNorm = true
needPrior = true
]
SGD = [
epochSize = 20480
minibatchSize = 256:1024:2048
learningRatesPerMB = 1.0:0.5:0.1
numMBsToShowResult = 10
momentumPerMB = 0.9:0.656119
maxEpochs = 3
keepCheckPointFiles = true
parallelTrain = [
parallelizationMethod = "DataParallelSGD"
distributedMBReading = true
dataParallelSGD = [
gradientBits = 1
]
]
autoAdjust=[
autoAdjustMinibatch = true
minibatchSizeTuningFrequency = 1
minibatchSearchCriterionErrorMargin = 2
]
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "$DataDir$/glob_0000.scp"
]
labels = [
mlfFile = "$DataDir$/glob_0000.mlf"
labelMappingFile = "$DataDir$/state.list"
labelDim = 132
labelType = "category"
]
]
]
currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu
DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config
OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu
DeviceId=-1
timestamping=true
speechTrain=[SGD=[maxEpochs=1]]
speechTrain=[SGD=[epochSize=2048]]
speechTrain=[reader=[useMersenneTwisterRand=true]]
08/16/2016 10:01:42: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<
08/16/2016 10:01:42: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
08/16/2016 10:01:42: RootDir = ".."
ConfigDir = "../Config"
DataDir = "../Data"
OutputDir = "../Output"
ModelDir = "/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu/Models"
deviceId = -1
command = speechTrain
precision = "float"
traceLevel = "1"
modelPath = "/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn"
parallelTrain = true
speechTrain = [
action = "train"
SimpleNetworkBuilder = [
layerSizes = 363:512:512:132
trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
applyMeanVarNorm = true
needPrior = true
]
SGD = [
epochSize = 20480
minibatchSize = 256:1024:2048
learningRatesPerMB = 1.0:0.5:0.1
numMBsToShowResult = 10
momentumPerMB = 0.9:0.656119
maxEpochs = 3
keepCheckPointFiles = true
parallelTrain = [
parallelizationMethod = "DataParallelSGD"
distributedMBReading = true
dataParallelSGD = [
gradientBits = 1
]
]
autoAdjust=[
autoAdjustMinibatch = true
minibatchSizeTuningFrequency = 1
minibatchSearchCriterionErrorMargin = 2
]
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp"
]
labels = [
mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf"
labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list"
labelDim = 132
labelType = "category"
]
]
]
currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu
DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config
OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu
DeviceId=-1
timestamping=true
speechTrain=[SGD=[maxEpochs=1]]
speechTrain=[SGD=[epochSize=2048]]
speechTrain=[reader=[useMersenneTwisterRand=true]]
08/16/2016 10:01:42: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
08/16/2016 10:01:42: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
configparameters: FeedForward.cntk:command=speechTrain
configparameters: FeedForward.cntk:ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config
configparameters: FeedForward.cntk:currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
configparameters: FeedForward.cntk:DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
configparameters: FeedForward.cntk:deviceId=-1
configparameters: FeedForward.cntk:ModelDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu/Models
configparameters: FeedForward.cntk:modelPath=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn
configparameters: FeedForward.cntk:OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu
configparameters: FeedForward.cntk:parallelTrain=true
configparameters: FeedForward.cntk:precision=float
configparameters: FeedForward.cntk:RootDir=..
configparameters: FeedForward.cntk:RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu
configparameters: FeedForward.cntk:speechTrain=[
action = "train"
SimpleNetworkBuilder = [
layerSizes = 363:512:512:132
trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
applyMeanVarNorm = true
needPrior = true
]
SGD = [
epochSize = 20480
minibatchSize = 256:1024:2048
learningRatesPerMB = 1.0:0.5:0.1
numMBsToShowResult = 10
momentumPerMB = 0.9:0.656119
maxEpochs = 3
keepCheckPointFiles = true
parallelTrain = [
parallelizationMethod = "DataParallelSGD"
distributedMBReading = true
dataParallelSGD = [
gradientBits = 1
]
]
autoAdjust=[
autoAdjustMinibatch = true
minibatchSizeTuningFrequency = 1
minibatchSearchCriterionErrorMargin = 2
]
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp"
]
labels = [
mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf"
labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list"
labelDim = 132
labelType = "category"
]
]
] [SGD=[maxEpochs=1]] [SGD=[epochSize=2048]] [reader=[useMersenneTwisterRand=true]]
configparameters: FeedForward.cntk:timestamping=true
configparameters: FeedForward.cntk:traceLevel=1
08/16/2016 10:01:42: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
08/16/2016 10:01:42: Commands: speechTrain
08/16/2016 10:01:42: Precision = "float"
08/16/2016 10:01:42: CNTKModelPath: /tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn
08/16/2016 10:01:42: CNTKCommandTrainInfo: speechTrain : 1
08/16/2016 10:01:42: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 1
08/16/2016 10:01:42: ##############################################################################
08/16/2016 10:01:42: # #
08/16/2016 10:01:42: # Action "train" #
08/16/2016 10:01:42: # #
08/16/2016 10:01:42: ##############################################################################
08/16/2016 10:01:42: CNTKCommandTrainBegin: speechTrain
SimpleNetworkBuilder Using CPU
reading script file /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp ... 948 entries
total 132 state names in state list /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list
htkmlfreader: reading MLF file /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf ... total 948 entries
...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
label set 0: 129 classes
minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
08/16/2016 10:01:42: Creating virgin network.
Node 'W0' (LearnableParameter operation): Initializing Parameter[512 x 363] <- 0.000000.
Node 'W0' (LearnableParameter operation): Initializing Parameter[512 x 363] <- uniform(seed=1, range=0.050000*1.000000, onCPU=false).
Node 'B0' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
Node 'B0' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
Node 'W1' (LearnableParameter operation): Initializing Parameter[512 x 512] <- 0.000000.
Node 'W1' (LearnableParameter operation): Initializing Parameter[512 x 512] <- uniform(seed=2, range=0.050000*1.000000, onCPU=false).
Node 'B1' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
Node 'B1' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
Node 'W2' (LearnableParameter operation): Initializing Parameter[132 x 512] <- 0.000000.
Node 'W2' (LearnableParameter operation): Initializing Parameter[132 x 512] <- uniform(seed=3, range=0.050000*1.000000, onCPU=false).
Node 'B2' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
Node 'B2' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
Post-processing network...
7 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
EvalErrorPrediction = ErrorPrediction()
InvStdOfFeatures = InvStdDev()
MeanOfFeatures = Mean()
PosteriorProb = Softmax()
Prior = Mean()
ScaledLogLikelihood = Minus()
Validating network. 25 nodes to process in pass 1.
Validating --> labels = InputValue() : -> [132 x *]
Validating --> W2 = LearnableParameter() : -> [132 x 512]
Validating --> W1 = LearnableParameter() : -> [512 x 512]
Validating --> W0 = LearnableParameter() : -> [512 x 363]
Validating --> features = InputValue() : -> [363 x *]
Validating --> MeanOfFeatures = Mean (features) : [363 x *] -> [363]
Validating --> InvStdOfFeatures = InvStdDev (features) : [363 x *] -> [363]
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization (features, MeanOfFeatures, InvStdOfFeatures) : [363 x *], [363], [363] -> [363 x *]
Validating --> W0*features = Times (W0, MVNormalizedFeatures) : [512 x 363], [363 x *] -> [512 x *]
Validating --> B0 = LearnableParameter() : -> [512 x 1]
Validating --> W0*features+B0 = Plus (W0*features, B0) : [512 x *], [512 x 1] -> [512 x 1 x *]
Validating --> H1 = Sigmoid (W0*features+B0) : [512 x 1 x *] -> [512 x 1 x *]
Validating --> W1*H1 = Times (W1, H1) : [512 x 512], [512 x 1 x *] -> [512 x 1 x *]
Validating --> B1 = LearnableParameter() : -> [512 x 1]
Validating --> W1*H1+B1 = Plus (W1*H1, B1) : [512 x 1 x *], [512 x 1] -> [512 x 1 x *]
Validating --> H2 = Sigmoid (W1*H1+B1) : [512 x 1 x *] -> [512 x 1 x *]
Validating --> W2*H1 = Times (W2, H2) : [132 x 512], [512 x 1 x *] -> [132 x 1 x *]
Validating --> B2 = LearnableParameter() : -> [132 x 1]
Validating --> HLast = Plus (W2*H1, B2) : [132 x 1 x *], [132 x 1] -> [132 x 1 x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [132 x *], [132 x 1 x *] -> [1]
Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [132 x *], [132 x 1 x *] -> [1]
Validating --> PosteriorProb = Softmax (HLast) : [132 x 1 x *] -> [132 x 1 x *]
Validating --> Prior = Mean (labels) : [132 x *] -> [132]
Validating --> LogOfPrior = Log (Prior) : [132] -> [132]
Validating --> ScaledLogLikelihood = Minus (HLast, LogOfPrior) : [132 x 1 x *], [132] -> [132 x 1 x *]
Validating network. 17 nodes to process in pass 2.
Validating network, final pass.
12 out of 25 nodes do not share the minibatch layout with the input data.
Post-processing network complete.
08/16/2016 10:01:42: Created model with 25 nodes on CPU.
08/16/2016 10:01:42: Training criterion node(s):
08/16/2016 10:01:42: CrossEntropyWithSoftmax = CrossEntropyWithSoftmax
08/16/2016 10:01:42: Evaluation criterion node(s):
08/16/2016 10:01:42: EvalErrorPrediction = ErrorPrediction
Allocating matrices for forward and/or backward propagation.
Memory Sharing: Out of 40 matrices, 19 are shared as 8, and 21 are not shared.
{ W1 : [512 x 512] (gradient)
W1*H1+B1 : [512 x 1 x *] }
{ H2 : [512 x 1 x *]
W1*H1 : [512 x 1 x *] (gradient) }
{ B0 : [512 x 1] (gradient)
H1 : [512 x 1 x *] (gradient)
W1*H1+B1 : [512 x 1 x *] (gradient)
W2*H1 : [132 x 1 x *] }
{ HLast : [132 x 1 x *]
W2 : [132 x 512] (gradient) }
{ B1 : [512 x 1] (gradient)
H2 : [512 x 1 x *] (gradient)
HLast : [132 x 1 x *] (gradient) }
{ W0 : [512 x 363] (gradient)
W0*features+B0 : [512 x 1 x *] }
{ H1 : [512 x 1 x *]
W0*features : [512 x *] (gradient) }
{ W0*features+B0 : [512 x 1 x *] (gradient)
W1*H1 : [512 x 1 x *] }
08/16/2016 10:01:42: Training 516740 parameters in 6 out of 6 parameter tensors and 15 nodes with gradient:
08/16/2016 10:01:42: Node 'B0' (LearnableParameter operation) : [512 x 1]
08/16/2016 10:01:42: Node 'B1' (LearnableParameter operation) : [512 x 1]
08/16/2016 10:01:42: Node 'B2' (LearnableParameter operation) : [132 x 1]
08/16/2016 10:01:42: Node 'W0' (LearnableParameter operation) : [512 x 363]
08/16/2016 10:01:42: Node 'W1' (LearnableParameter operation) : [512 x 512]
08/16/2016 10:01:42: Node 'W2' (LearnableParameter operation) : [132 x 512]
08/16/2016 10:01:42: Precomputing --> 3 PreCompute nodes found.
08/16/2016 10:01:42: MeanOfFeatures = Mean()
08/16/2016 10:01:42: InvStdOfFeatures = InvStdDev()
08/16/2016 10:01:42: Prior = Mean()
minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
08/16/2016 10:01:43: Precomputing --> Completed.
08/16/2016 10:01:43: Starting Epoch 1: learning rate per sample = 0.003906 effective momentum = 0.900000 momentum as time constant = 2429.8 samples
minibatchiterator: epoch 0: frames [0..2048] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
08/16/2016 10:01:43: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1), distributed reading is ENABLED.
08/16/2016 10:01:44: Finished Epoch[ 1 of 1]: [Training] CrossEntropyWithSoftmax = 4.45117986 * 2048; EvalErrorPrediction = 0.92187500 * 2048; totalSamplesSeen = 2048; learningRatePerSample = 0.00390625; epochTime=0.209966s
08/16/2016 10:01:44: SGD: Saving checkpoint model '/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn'
08/16/2016 10:01:44: CNTKCommandTrainEnd: speechTrain
08/16/2016 10:01:44: Action "train" complete.
08/16/2016 10:01:44: __COMPLETED__
~MPIWrapper

View file

@@ -1 +0,0 @@
__COMPLETED__

View file

@@ -1 +0,0 @@
__COMPLETED__

View file

@@ -0,0 +1,435 @@
CPU info:
CPU Model Name: Intel(R) Xeon(R) CPU E5-2630 v2 @ 2.60GHz
Hardware threads: 24
Total Memory: 264172964 kB
-------------------------------------------------------------------
=== Running /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/1bitsgd/release/bin/cntk configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config/FeedForward.cntk currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu DeviceId=0 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=2048]] speechTrain=[reader=[useMersenneTwisterRand=true]]
-------------------------------------------------------------------
Build info:
Built time: Aug 16 2016 09:41:57
Last modified date: Mon Aug 15 23:39:17 2016
Build type: release
Build target: GPU
With 1bit-SGD: yes
Math lib: mkl
CUDA_PATH: /usr/local/cuda-7.5
CUB_PATH: /usr/local/cub-1.4.1
CUDNN_PATH: /usr/local/cudnn-4.0
Build Branch: HEAD
Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
Built by philly on 643085f7f8c2
Build Path: /home/philly/jenkins/workspace/CNTK-Build-Linux
-------------------------------------------------------------------
Changed current directory to /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
MPIWrapper: initializing MPI
ping [requestnodes (before change)]: 1 nodes pinging each other
ping [requestnodes (before change)]: all 1 nodes responded
requestnodes [MPIWrapper]: using 1 out of 1 MPI nodes (1 requested); we (0) are in (participating)
ping [requestnodes (after change)]: 1 nodes pinging each other
ping [requestnodes (after change)]: all 1 nodes responded
mpihelper: only one MPI process: MPI operation will be boring
ping [mpihelper]: 1 nodes pinging each other
ping [mpihelper]: all 1 nodes responded
08/16/2016 10:01:45: -------------------------------------------------------------------
08/16/2016 10:01:45: Build info:
08/16/2016 10:01:45: Built time: Aug 16 2016 09:41:57
08/16/2016 10:01:45: Last modified date: Mon Aug 15 23:39:17 2016
08/16/2016 10:01:45: Build type: release
08/16/2016 10:01:45: Build target: GPU
08/16/2016 10:01:45: With 1bit-SGD: yes
08/16/2016 10:01:45: Math lib: mkl
08/16/2016 10:01:45: CUDA_PATH: /usr/local/cuda-7.5
08/16/2016 10:01:45: CUB_PATH: /usr/local/cub-1.4.1
08/16/2016 10:01:45: CUDNN_PATH: /usr/local/cudnn-4.0
08/16/2016 10:01:45: Build Branch: HEAD
08/16/2016 10:01:45: Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
08/16/2016 10:01:45: Built by philly on 643085f7f8c2
08/16/2016 10:01:45: Build Path: /home/philly/jenkins/workspace/CNTK-Build-Linux
08/16/2016 10:01:45: -------------------------------------------------------------------
08/16/2016 10:01:46: -------------------------------------------------------------------
08/16/2016 10:01:46: GPU info:
08/16/2016 10:01:46: Device[0]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
08/16/2016 10:01:46: Device[1]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
08/16/2016 10:01:46: Device[2]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
08/16/2016 10:01:46: Device[3]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
08/16/2016 10:01:46: -------------------------------------------------------------------
08/16/2016 10:01:46: Running on localhost at 2016/08/16 10:01:46
08/16/2016 10:01:46: Command line:
/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/1bitsgd/release/bin/cntk configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config/FeedForward.cntk currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu DeviceId=0 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=2048]] speechTrain=[reader=[useMersenneTwisterRand=true]]
08/16/2016 10:01:46: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
08/16/2016 10:01:46: RootDir = ".."
ConfigDir = "$RootDir$/Config"
DataDir = "$RootDir$/Data"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"
deviceId = -1
command = speechTrain
precision = "float"
traceLevel = "1"
modelPath = "$ModelDir$/cntkSpeechFF.dnn"
parallelTrain = true
speechTrain = [
action = "train"
SimpleNetworkBuilder = [
layerSizes = 363:512:512:132
trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
applyMeanVarNorm = true
needPrior = true
]
SGD = [
epochSize = 20480
minibatchSize = 256:1024:2048
learningRatesPerMB = 1.0:0.5:0.1
numMBsToShowResult = 10
momentumPerMB = 0.9:0.656119
maxEpochs = 3
keepCheckPointFiles = true
parallelTrain = [
parallelizationMethod = "DataParallelSGD"
distributedMBReading = true
dataParallelSGD = [
gradientBits = 1
]
]
autoAdjust=[
autoAdjustMinibatch = true
minibatchSizeTuningFrequency = 1
minibatchSearchCriterionErrorMargin = 2
]
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "$DataDir$/glob_0000.scp"
]
labels = [
mlfFile = "$DataDir$/glob_0000.mlf"
labelMappingFile = "$DataDir$/state.list"
labelDim = 132
labelType = "category"
]
]
]
currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu
DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config
OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu
DeviceId=0
timestamping=true
speechTrain=[SGD=[maxEpochs=1]]
speechTrain=[SGD=[epochSize=2048]]
speechTrain=[reader=[useMersenneTwisterRand=true]]
08/16/2016 10:01:46: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<
08/16/2016 10:01:46: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
08/16/2016 10:01:46: RootDir = ".."
ConfigDir = "../Config"
DataDir = "../Data"
OutputDir = "../Output"
ModelDir = "/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu/Models"
deviceId = -1
command = speechTrain
precision = "float"
traceLevel = "1"
modelPath = "/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn"
parallelTrain = true
speechTrain = [
action = "train"
SimpleNetworkBuilder = [
layerSizes = 363:512:512:132
trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
applyMeanVarNorm = true
needPrior = true
]
SGD = [
epochSize = 20480
minibatchSize = 256:1024:2048
learningRatesPerMB = 1.0:0.5:0.1
numMBsToShowResult = 10
momentumPerMB = 0.9:0.656119
maxEpochs = 3
keepCheckPointFiles = true
parallelTrain = [
parallelizationMethod = "DataParallelSGD"
distributedMBReading = true
dataParallelSGD = [
gradientBits = 1
]
]
autoAdjust=[
autoAdjustMinibatch = true
minibatchSizeTuningFrequency = 1
minibatchSearchCriterionErrorMargin = 2
]
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp"
]
labels = [
mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf"
labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list"
labelDim = 132
labelType = "category"
]
]
]
currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu
DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config
OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu
DeviceId=0
timestamping=true
speechTrain=[SGD=[maxEpochs=1]]
speechTrain=[SGD=[epochSize=2048]]
speechTrain=[reader=[useMersenneTwisterRand=true]]
08/16/2016 10:01:46: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
08/16/2016 10:01:46: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
configparameters: FeedForward.cntk:command=speechTrain
configparameters: FeedForward.cntk:ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/FeedForward/../../../../../../Examples/Speech/AN4/Config
configparameters: FeedForward.cntk:currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
configparameters: FeedForward.cntk:DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
configparameters: FeedForward.cntk:deviceId=0
configparameters: FeedForward.cntk:ModelDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu/Models
configparameters: FeedForward.cntk:modelPath=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn
configparameters: FeedForward.cntk:OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu
configparameters: FeedForward.cntk:parallelTrain=true
configparameters: FeedForward.cntk:precision=float
configparameters: FeedForward.cntk:RootDir=..
configparameters: FeedForward.cntk:RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu
configparameters: FeedForward.cntk:speechTrain=[
action = "train"
SimpleNetworkBuilder = [
layerSizes = 363:512:512:132
trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
applyMeanVarNorm = true
needPrior = true
]
SGD = [
epochSize = 20480
minibatchSize = 256:1024:2048
learningRatesPerMB = 1.0:0.5:0.1
numMBsToShowResult = 10
momentumPerMB = 0.9:0.656119
maxEpochs = 3
keepCheckPointFiles = true
parallelTrain = [
parallelizationMethod = "DataParallelSGD"
distributedMBReading = true
dataParallelSGD = [
gradientBits = 1
]
]
autoAdjust=[
autoAdjustMinibatch = true
minibatchSizeTuningFrequency = 1
minibatchSearchCriterionErrorMargin = 2
]
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp"
]
labels = [
mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf"
labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list"
labelDim = 132
labelType = "category"
]
]
] [SGD=[maxEpochs=1]] [SGD=[epochSize=2048]] [reader=[useMersenneTwisterRand=true]]
configparameters: FeedForward.cntk:timestamping=true
configparameters: FeedForward.cntk:traceLevel=1
08/16/2016 10:01:46: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
08/16/2016 10:01:46: Commands: speechTrain
08/16/2016 10:01:46: Precision = "float"
08/16/2016 10:01:46: CNTKModelPath: /tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn
08/16/2016 10:01:46: CNTKCommandTrainInfo: speechTrain : 1
08/16/2016 10:01:46: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 1
08/16/2016 10:01:46: ##############################################################################
08/16/2016 10:01:46: # #
08/16/2016 10:01:46: # Action "train" #
08/16/2016 10:01:46: # #
08/16/2016 10:01:46: ##############################################################################
08/16/2016 10:01:46: CNTKCommandTrainBegin: speechTrain
SimpleNetworkBuilder Using GPU 0
reading script file /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp ... 948 entries
total 132 state names in state list /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list
htkmlfreader: reading MLF file /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf ... total 948 entries
...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
label set 0: 129 classes
minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
08/16/2016 10:01:46: Creating virgin network.
Node 'W0' (LearnableParameter operation): Initializing Parameter[512 x 363] <- 0.000000.
Node 'W0' (LearnableParameter operation): Initializing Parameter[512 x 363] <- uniform(seed=1, range=0.050000*1.000000, onCPU=false).
SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4
Node 'B0' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
Node 'B0' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
Node 'W1' (LearnableParameter operation): Initializing Parameter[512 x 512] <- 0.000000.
Node 'W1' (LearnableParameter operation): Initializing Parameter[512 x 512] <- uniform(seed=2, range=0.050000*1.000000, onCPU=false).
Node 'B1' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
Node 'B1' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
Node 'W2' (LearnableParameter operation): Initializing Parameter[132 x 512] <- 0.000000.
Node 'W2' (LearnableParameter operation): Initializing Parameter[132 x 512] <- uniform(seed=3, range=0.050000*1.000000, onCPU=false).
Node 'B2' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
Node 'B2' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
Post-processing network...
7 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
EvalErrorPrediction = ErrorPrediction()
InvStdOfFeatures = InvStdDev()
MeanOfFeatures = Mean()
PosteriorProb = Softmax()
Prior = Mean()
ScaledLogLikelihood = Minus()
Validating network. 25 nodes to process in pass 1.
Validating --> labels = InputValue() : -> [132 x *]
Validating --> W2 = LearnableParameter() : -> [132 x 512]
Validating --> W1 = LearnableParameter() : -> [512 x 512]
Validating --> W0 = LearnableParameter() : -> [512 x 363]
Validating --> features = InputValue() : -> [363 x *]
Validating --> MeanOfFeatures = Mean (features) : [363 x *] -> [363]
Validating --> InvStdOfFeatures = InvStdDev (features) : [363 x *] -> [363]
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization (features, MeanOfFeatures, InvStdOfFeatures) : [363 x *], [363], [363] -> [363 x *]
Validating --> W0*features = Times (W0, MVNormalizedFeatures) : [512 x 363], [363 x *] -> [512 x *]
Validating --> B0 = LearnableParameter() : -> [512 x 1]
Validating --> W0*features+B0 = Plus (W0*features, B0) : [512 x *], [512 x 1] -> [512 x 1 x *]
Validating --> H1 = Sigmoid (W0*features+B0) : [512 x 1 x *] -> [512 x 1 x *]
Validating --> W1*H1 = Times (W1, H1) : [512 x 512], [512 x 1 x *] -> [512 x 1 x *]
Validating --> B1 = LearnableParameter() : -> [512 x 1]
Validating --> W1*H1+B1 = Plus (W1*H1, B1) : [512 x 1 x *], [512 x 1] -> [512 x 1 x *]
Validating --> H2 = Sigmoid (W1*H1+B1) : [512 x 1 x *] -> [512 x 1 x *]
Validating --> W2*H1 = Times (W2, H2) : [132 x 512], [512 x 1 x *] -> [132 x 1 x *]
Validating --> B2 = LearnableParameter() : -> [132 x 1]
Validating --> HLast = Plus (W2*H1, B2) : [132 x 1 x *], [132 x 1] -> [132 x 1 x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [132 x *], [132 x 1 x *] -> [1]
Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [132 x *], [132 x 1 x *] -> [1]
Validating --> PosteriorProb = Softmax (HLast) : [132 x 1 x *] -> [132 x 1 x *]
Validating --> Prior = Mean (labels) : [132 x *] -> [132]
Validating --> LogOfPrior = Log (Prior) : [132] -> [132]
Validating --> ScaledLogLikelihood = Minus (HLast, LogOfPrior) : [132 x 1 x *], [132] -> [132 x 1 x *]
Validating network. 17 nodes to process in pass 2.
Validating network, final pass.
12 out of 25 nodes do not share the minibatch layout with the input data.
Post-processing network complete.
08/16/2016 10:01:46: Created model with 25 nodes on GPU 0.
08/16/2016 10:01:46: Training criterion node(s):
08/16/2016 10:01:46: CrossEntropyWithSoftmax = CrossEntropyWithSoftmax
08/16/2016 10:01:46: Evaluation criterion node(s):
08/16/2016 10:01:46: EvalErrorPrediction = ErrorPrediction
Allocating matrices for forward and/or backward propagation.
Memory Sharing: Out of 40 matrices, 19 are shared as 8, and 21 are not shared.
{ W0 : [512 x 363] (gradient)
W0*features+B0 : [512 x 1 x *] }
{ H1 : [512 x 1 x *]
W0*features : [512 x *] (gradient) }
{ W0*features+B0 : [512 x 1 x *] (gradient)
W1*H1 : [512 x 1 x *] }
{ W1 : [512 x 512] (gradient)
W1*H1+B1 : [512 x 1 x *] }
{ H2 : [512 x 1 x *]
W1*H1 : [512 x 1 x *] (gradient) }
{ B0 : [512 x 1] (gradient)
H1 : [512 x 1 x *] (gradient)
W1*H1+B1 : [512 x 1 x *] (gradient)
W2*H1 : [132 x 1 x *] }
{ HLast : [132 x 1 x *]
W2 : [132 x 512] (gradient) }
{ B1 : [512 x 1] (gradient)
H2 : [512 x 1 x *] (gradient)
HLast : [132 x 1 x *] (gradient) }
08/16/2016 10:01:46: Training 516740 parameters in 6 out of 6 parameter tensors and 15 nodes with gradient:
08/16/2016 10:01:46: Node 'B0' (LearnableParameter operation) : [512 x 1]
08/16/2016 10:01:46: Node 'B1' (LearnableParameter operation) : [512 x 1]
08/16/2016 10:01:46: Node 'B2' (LearnableParameter operation) : [132 x 1]
08/16/2016 10:01:46: Node 'W0' (LearnableParameter operation) : [512 x 363]
08/16/2016 10:01:46: Node 'W1' (LearnableParameter operation) : [512 x 512]
08/16/2016 10:01:46: Node 'W2' (LearnableParameter operation) : [132 x 512]
08/16/2016 10:01:46: Precomputing --> 3 PreCompute nodes found.
08/16/2016 10:01:46: MeanOfFeatures = Mean()
08/16/2016 10:01:46: InvStdOfFeatures = InvStdDev()
08/16/2016 10:01:46: Prior = Mean()
minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
08/16/2016 10:01:46: Precomputing --> Completed.
08/16/2016 10:01:46: Starting Epoch 1: learning rate per sample = 0.003906 effective momentum = 0.900000 momentum as time constant = 2429.8 samples
minibatchiterator: epoch 0: frames [0..2048] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
08/16/2016 10:01:46: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1), distributed reading is ENABLED.
08/16/2016 10:01:46: Finished Epoch[ 1 of 1]: [Training] CrossEntropyWithSoftmax = 4.41144794 * 2048; EvalErrorPrediction = 0.92773438 * 2048; totalSamplesSeen = 2048; learningRatePerSample = 0.00390625; epochTime=0.023072s
08/16/2016 10:01:46: SGD: Saving checkpoint model '/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn'
08/16/2016 10:01:46: CNTKCommandTrainEnd: speechTrain
08/16/2016 10:01:46: Action "train" complete.
08/16/2016 10:01:46: __COMPLETED__
~MPIWrapper

View file

@@ -1 +0,0 @@
__COMPLETED__

View file

@@ -1 +0,0 @@
__COMPLETED__

View file

@ -1,18 +1,24 @@
=== Running /cygdrive/c/jenkins/workspace/CNTK-Test-Windows-W1/x64/release/cntk.exe configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/FeedForward.cntk currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu DeviceId=-1 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=2048]] CPU info:
CPU Model Name: Intel(R) Xeon(R) CPU E5-2630 v2 @ 2.60GHz
Hardware threads: 24
Total Memory: 268381192 kB
-------------------------------------------------------------------
=== Running /cygdrive/c/jenkins/workspace/CNTK-Test-Windows-W1/x64/release/cntk.exe configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/FeedForward.cntk currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu DeviceId=-1 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=2048]] speechTrain=[reader=[useMersenneTwisterRand=true]]
------------------------------------------------------------------- -------------------------------------------------------------------
Build info: Build info:
Built time: May 3 2016 13:15:46 Built time: Aug 16 2016 03:09:16
Last modified date: Tue Apr 26 23:35:31 2016 Last modified date: Fri Aug 12 05:28:23 2016
Build type: Release Build type: Release
Build target: GPU Build target: GPU
With 1bit-SGD: no With 1bit-SGD: yes
Math lib: mkl
CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5 CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5
CUB_PATH: c:\src\cub-1.4.1 CUB_PATH: c:\src\cub-1.4.1
CUDNN_PATH: c:\NVIDIA\cudnn-4.0\cuda CUDNN_PATH: c:\NVIDIA\cudnn-4.0\cuda
Build Branch: HEAD Build Branch: HEAD
Build SHA1: af96f7cce6c3c78a4f1e9315e061291c79360e12 Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
Built by svcphil on cntk-muc01 Built by svcphil on Philly-Pool1
Build Path: c:\jenkins\workspace\CNTK-Build-Windows\Source\CNTK\ Build Path: c:\jenkins\workspace\CNTK-Build-Windows\Source\CNTK\
------------------------------------------------------------------- -------------------------------------------------------------------
Changed current directory to C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data Changed current directory to C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
@ -25,31 +31,39 @@ ping [requestnodes (after change)]: all 1 nodes responded
mpihelper: only one MPI process: MPI operation will be boring mpihelper: only one MPI process: MPI operation will be boring
ping [mpihelper]: 1 nodes pinging each other ping [mpihelper]: 1 nodes pinging each other
ping [mpihelper]: all 1 nodes responded ping [mpihelper]: all 1 nodes responded
05/03/2016 13:22:22: ------------------------------------------------------------------- 08/16/2016 03:20:10: -------------------------------------------------------------------
05/03/2016 13:22:22: Build info: 08/16/2016 03:20:10: Build info:
05/03/2016 13:22:22: Built time: May 3 2016 13:15:46 08/16/2016 03:20:10: Built time: Aug 16 2016 03:09:16
05/03/2016 13:22:22: Last modified date: Tue Apr 26 23:35:31 2016 08/16/2016 03:20:10: Last modified date: Fri Aug 12 05:28:23 2016
05/03/2016 13:22:22: Build type: Release 08/16/2016 03:20:10: Build type: Release
05/03/2016 13:22:22: Build target: GPU 08/16/2016 03:20:10: Build target: GPU
05/03/2016 13:22:22: With 1bit-SGD: no 08/16/2016 03:20:10: With 1bit-SGD: yes
05/03/2016 13:22:22: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5 08/16/2016 03:20:10: Math lib: mkl
05/03/2016 13:22:22: CUB_PATH: c:\src\cub-1.4.1 08/16/2016 03:20:10: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5
05/03/2016 13:22:22: CUDNN_PATH: c:\NVIDIA\cudnn-4.0\cuda 08/16/2016 03:20:10: CUB_PATH: c:\src\cub-1.4.1
05/03/2016 13:22:22: Build Branch: HEAD 08/16/2016 03:20:10: CUDNN_PATH: c:\NVIDIA\cudnn-4.0\cuda
05/03/2016 13:22:22: Build SHA1: af96f7cce6c3c78a4f1e9315e061291c79360e12 08/16/2016 03:20:10: Build Branch: HEAD
05/03/2016 13:22:22: Built by svcphil on cntk-muc01 08/16/2016 03:20:10: Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
05/03/2016 13:22:22: Build Path: c:\jenkins\workspace\CNTK-Build-Windows\Source\CNTK\ 08/16/2016 03:20:10: Built by svcphil on Philly-Pool1
05/03/2016 13:22:22: ------------------------------------------------------------------- 08/16/2016 03:20:10: Build Path: c:\jenkins\workspace\CNTK-Build-Windows\Source\CNTK\
08/16/2016 03:20:10: -------------------------------------------------------------------
08/16/2016 03:20:12: -------------------------------------------------------------------
08/16/2016 03:20:12: GPU info:
05/03/2016 13:22:22: Running on DPHAIM-22 at 2016/05/03 13:22:22 08/16/2016 03:20:12: Device[0]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
05/03/2016 13:22:22: Command line: 08/16/2016 03:20:12: Device[1]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
C:\jenkins\workspace\CNTK-Test-Windows-W1\x64\release\cntk.exe configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/FeedForward.cntk currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu DeviceId=-1 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=2048]] 08/16/2016 03:20:12: Device[2]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
08/16/2016 03:20:12: -------------------------------------------------------------------
08/16/2016 03:20:12: Running on DPHAIM-25 at 2016/08/16 03:20:12
08/16/2016 03:20:12: Command line:
C:\jenkins\workspace\CNTK-Test-Windows-W1\x64\release\cntk.exe configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/FeedForward.cntk currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu DeviceId=-1 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=2048]] speechTrain=[reader=[useMersenneTwisterRand=true]]
05/03/2016 13:22:22: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> 08/16/2016 03:20:12: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
05/03/2016 13:22:22: RootDir = ".." 08/16/2016 03:20:12: RootDir = ".."
ConfigDir = "$RootDir$/Config" ConfigDir = "$RootDir$/Config"
DataDir = "$RootDir$/Data" DataDir = "$RootDir$/Data"
OutputDir = "$RootDir$/Output" OutputDir = "$RootDir$/Output"
@ -65,7 +79,7 @@ speechTrain = [
SimpleNetworkBuilder = [ SimpleNetworkBuilder = [
layerSizes = 363:512:512:132 layerSizes = 363:512:512:132
trainingCriterion = "CrossEntropyWithSoftmax" trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ClassificationError" evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid" layerTypes = "Sigmoid"
applyMeanVarNorm = true applyMeanVarNorm = true
needPrior = true needPrior = true
@ -111,35 +125,36 @@ speechTrain = [
] ]
] ]
currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu
DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu
DeviceId=-1 DeviceId=-1
timestamping=true timestamping=true
speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[maxEpochs=1]]
speechTrain=[SGD=[epochSize=2048]] speechTrain=[SGD=[epochSize=2048]]
speechTrain=[reader=[useMersenneTwisterRand=true]]
05/03/2016 13:22:22: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< 08/16/2016 03:20:12: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<
05/03/2016 13:22:22: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> 08/16/2016 03:20:12: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
05/03/2016 13:22:22: RootDir = ".." 08/16/2016 03:20:12: RootDir = ".."
ConfigDir = "../Config" ConfigDir = "../Config"
DataDir = "../Data" DataDir = "../Data"
OutputDir = "../Output" OutputDir = "../Output"
ModelDir = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu/Models" ModelDir = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu/Models"
deviceId = -1 deviceId = -1
command = speechTrain command = speechTrain
precision = "float" precision = "float"
traceLevel = "1" traceLevel = "1"
modelPath = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn" modelPath = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn"
parallelTrain = true parallelTrain = true
speechTrain = [ speechTrain = [
action = "train" action = "train"
SimpleNetworkBuilder = [ SimpleNetworkBuilder = [
layerSizes = 363:512:512:132 layerSizes = 363:512:512:132
trainingCriterion = "CrossEntropyWithSoftmax" trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ClassificationError" evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid" layerTypes = "Sigmoid"
applyMeanVarNorm = true applyMeanVarNorm = true
needPrior = true needPrior = true
@ -185,36 +200,37 @@ speechTrain = [
] ]
] ]
currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu
DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu
DeviceId=-1 DeviceId=-1
timestamping=true timestamping=true
speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[maxEpochs=1]]
speechTrain=[SGD=[epochSize=2048]] speechTrain=[SGD=[epochSize=2048]]
speechTrain=[reader=[useMersenneTwisterRand=true]]
05/03/2016 13:22:22: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< 08/16/2016 03:20:12: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
05/03/2016 13:22:22: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> 08/16/2016 03:20:12: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
configparameters: FeedForward.cntk:command=speechTrain configparameters: FeedForward.cntk:command=speechTrain
configparameters: FeedForward.cntk:ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config configparameters: FeedForward.cntk:ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
configparameters: FeedForward.cntk:currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data configparameters: FeedForward.cntk:currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
configparameters: FeedForward.cntk:DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data configparameters: FeedForward.cntk:DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
configparameters: FeedForward.cntk:deviceId=-1 configparameters: FeedForward.cntk:deviceId=-1
configparameters: FeedForward.cntk:ModelDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu/Models configparameters: FeedForward.cntk:ModelDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu/Models
configparameters: FeedForward.cntk:modelPath=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn configparameters: FeedForward.cntk:modelPath=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn
configparameters: FeedForward.cntk:OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu configparameters: FeedForward.cntk:OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu
configparameters: FeedForward.cntk:parallelTrain=true configparameters: FeedForward.cntk:parallelTrain=true
configparameters: FeedForward.cntk:precision=float configparameters: FeedForward.cntk:precision=float
configparameters: FeedForward.cntk:RootDir=.. configparameters: FeedForward.cntk:RootDir=..
configparameters: FeedForward.cntk:RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu configparameters: FeedForward.cntk:RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu
configparameters: FeedForward.cntk:speechTrain=[ configparameters: FeedForward.cntk:speechTrain=[
action = "train" action = "train"
SimpleNetworkBuilder = [ SimpleNetworkBuilder = [
layerSizes = 363:512:512:132 layerSizes = 363:512:512:132
trainingCriterion = "CrossEntropyWithSoftmax" trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ClassificationError" evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid" layerTypes = "Sigmoid"
applyMeanVarNorm = true applyMeanVarNorm = true
needPrior = true needPrior = true
@ -258,24 +274,24 @@ configparameters: FeedForward.cntk:speechTrain=[
labelType = "category" labelType = "category"
] ]
] ]
] [SGD=[maxEpochs=1]] [SGD=[epochSize=2048]] ] [SGD=[maxEpochs=1]] [SGD=[epochSize=2048]] [reader=[useMersenneTwisterRand=true]]
configparameters: FeedForward.cntk:timestamping=true configparameters: FeedForward.cntk:timestamping=true
configparameters: FeedForward.cntk:traceLevel=1 configparameters: FeedForward.cntk:traceLevel=1
05/03/2016 13:22:22: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< 08/16/2016 03:20:12: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
05/03/2016 13:22:22: Commands: speechTrain 08/16/2016 03:20:12: Commands: speechTrain
05/03/2016 13:22:22: Precision = "float" 08/16/2016 03:20:12: Precision = "float"
05/03/2016 13:22:22: CNTKModelPath: C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn 08/16/2016 03:20:12: CNTKModelPath: C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn
05/03/2016 13:22:22: CNTKCommandTrainInfo: speechTrain : 1 08/16/2016 03:20:12: CNTKCommandTrainInfo: speechTrain : 1
05/03/2016 13:22:22: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 1 08/16/2016 03:20:12: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 1
05/03/2016 13:22:22: ############################################################################## 08/16/2016 03:20:12: ##############################################################################
05/03/2016 13:22:22: # # 08/16/2016 03:20:12: # #
05/03/2016 13:22:22: # Action "train" # 08/16/2016 03:20:12: # Action "train" #
05/03/2016 13:22:22: # # 08/16/2016 03:20:12: # #
05/03/2016 13:22:22: ############################################################################## 08/16/2016 03:20:12: ##############################################################################
05/03/2016 13:22:22: CNTKCommandTrainBegin: speechTrain 08/16/2016 03:20:12: CNTKCommandTrainBegin: speechTrain
SimpleNetworkBuilder Using CPU SimpleNetworkBuilder Using CPU
reading script file C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.scp ... 948 entries reading script file C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.scp ... 948 entries
total 132 state names in state list C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/state.list total 132 state names in state list C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/state.list
@ -284,13 +300,25 @@ htkmlfreader: reading MLF file C:\jenkins\workspace\CNTK-Test-Windows-W1\Example
label set 0: 129 classes label set 0: 129 classes
minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
05/03/2016 13:22:23: Creating virgin network. 08/16/2016 03:20:12: Creating virgin network.
Node 'W0' (LearnableParameter operation): Initializing Parameter[512 x 363] <- 0.000000.
Node 'W0' (LearnableParameter operation): Initializing Parameter[512 x 363] <- uniform(seed=1, range=0.050000*1.000000, onCPU=false).
Node 'B0' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
Node 'B0' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
Node 'W1' (LearnableParameter operation): Initializing Parameter[512 x 512] <- 0.000000.
Node 'W1' (LearnableParameter operation): Initializing Parameter[512 x 512] <- uniform(seed=2, range=0.050000*1.000000, onCPU=false).
Node 'B1' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
Node 'B1' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
Node 'W2' (LearnableParameter operation): Initializing Parameter[132 x 512] <- 0.000000.
Node 'W2' (LearnableParameter operation): Initializing Parameter[132 x 512] <- uniform(seed=3, range=0.050000*1.000000, onCPU=false).
Node 'B2' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
Node 'B2' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
 Post-processing network...
 7 roots:
 CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-EvalClassificationError = ClassificationError()
+EvalErrorPrediction = ErrorPrediction()
 InvStdOfFeatures = InvStdDev()
 MeanOfFeatures = Mean()
 PosteriorProb = Softmax()
@@ -319,7 +347,7 @@ Validating --> W2*H1 = Times (W2, H2) : [132 x 512], [512 x 1 x *] -> [132 x 1 x
 Validating --> B2 = LearnableParameter() : -> [132 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [132 x 1 x *], [132 x 1] -> [132 x 1 x *]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [132 x *], [132 x 1 x *] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [132 x *], [132 x 1 x *] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [132 x *], [132 x 1 x *] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [132 x 1 x *] -> [132 x 1 x *]
 Validating --> Prior = Mean (labels) : [132 x *] -> [132]
 Validating --> LogOfPrior = Log (Prior) : [132] -> [132]
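Among the roots validated here, PosteriorProb, Prior, LogOfPrior and ScaledLogLikelihood implement the usual hybrid-ASR conversion: the network's softmax posterior is divided by the class prior (a subtraction in log space) before the scores go to the HMM decoder. A hedged sketch of that relationship (our code, not CNTK's):

    import numpy as np

    def scaled_log_likelihood(logits, log_prior):
        # log p(state | x) - log p(state): softmax in log space, minus prior.
        log_post = logits - np.logaddexp.reduce(logits, axis=0, keepdims=True)
        return log_post - log_prior[:, None]

    # logits: [132 x T] network outputs (HLast); log_prior: [132] (LogOfPrior)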
@@ -336,70 +364,70 @@ Validating network, final pass.
 Post-processing network complete.
-05/03/2016 13:22:23: Created model with 25 nodes on CPU.
-05/03/2016 13:22:23: Training criterion node(s):
-05/03/2016 13:22:23: CrossEntropyWithSoftmax = CrossEntropyWithSoftmax
-05/03/2016 13:22:23: Evaluation criterion node(s):
-05/03/2016 13:22:23: EvalClassificationError = ClassificationError
+08/16/2016 03:20:12: Created model with 25 nodes on CPU.
+08/16/2016 03:20:12: Training criterion node(s):
+08/16/2016 03:20:12: CrossEntropyWithSoftmax = CrossEntropyWithSoftmax
+08/16/2016 03:20:12: Evaluation criterion node(s):
+08/16/2016 03:20:12: EvalErrorPrediction = ErrorPrediction
 Allocating matrices for forward and/or backward propagation.
-Memory Sharing Structure:
-0000000000000000: {[EvalClassificationError Gradient[1]] [InvStdOfFeatures Gradient[363]] [LogOfPrior Gradient[132]] [MVNormalizedFeatures Gradient[363 x *]] [MeanOfFeatures Gradient[363]] [PosteriorProb Gradient[132 x 1 x *]] [PosteriorProb Value[132 x 1 x *]] [Prior Gradient[132]] [ScaledLogLikelihood Gradient[132 x 1 x *]] [features Gradient[363 x *]] [labels Gradient[132 x *]] }
-000000BDD334C430: {[features Value[363 x *]] }
-000000BDD334C4D0: {[W0 Value[512 x 363]] }
-000000BDD334C610: {[MeanOfFeatures Value[363]] }
-000000BDD334C890: {[B0 Value[512 x 1]] }
-000000BDD334CCF0: {[W1 Value[512 x 512]] }
-000000BDD334CE30: {[B1 Value[512 x 1]] }
-000000BDD334D1F0: {[InvStdOfFeatures Value[363]] }
-000000BDD5BCA080: {[Prior Value[132]] }
-000000BDD5BCA120: {[EvalClassificationError Value[1]] }
-000000BDD5BCA260: {[W2 Value[132 x 512]] }
-000000BDD5BCA440: {[labels Value[132 x *]] }
-000000BDD5BCA6C0: {[MVNormalizedFeatures Value[363 x *]] }
-000000BDD5BCAE40: {[B0 Gradient[512 x 1]] [H1 Gradient[512 x 1 x *]] [W1*H1+B1 Gradient[512 x 1 x *]] [W2*H1 Value[132 x 1 x *]] }
-000000BDD5BCAEE0: {[CrossEntropyWithSoftmax Gradient[1]] }
-000000BDD5BCAF80: {[B1 Gradient[512 x 1]] [H2 Gradient[512 x 1 x *]] [HLast Gradient[132 x 1 x *]] }
-000000BDD5BCB0C0: {[H1 Value[512 x 1 x *]] [W0*features Gradient[512 x *]] }
-000000BDD5BCB160: {[ScaledLogLikelihood Value[132 x 1 x *]] }
-000000BDD5BCB340: {[W0 Gradient[512 x 363]] [W0*features+B0 Value[512 x 1 x *]] }
-000000BDD5BCB520: {[W1 Gradient[512 x 512]] [W1*H1+B1 Value[512 x 1 x *]] }
-000000BDD5BCB5C0: {[B2 Gradient[132 x 1]] }
-000000BDD5BCB700: {[W0*features Value[512 x *]] }
-000000BDD5BCB7A0: {[HLast Value[132 x 1 x *]] [W2 Gradient[132 x 512]] }
-000000BDD5BCB8E0: {[LogOfPrior Value[132]] }
-000000BDD5BCB980: {[H2 Value[512 x 1 x *]] [W1*H1 Gradient[512 x 1 x *]] }
-000000BDD5BCBAC0: {[B2 Value[132 x 1]] }
-000000BDD5BCBB60: {[CrossEntropyWithSoftmax Value[1]] }
-000000BDD5BCBC00: {[W0*features+B0 Gradient[512 x 1 x *]] [W1*H1 Value[512 x 1 x *]] }
-000000BDD5BCBCA0: {[W2*H1 Gradient[132 x 1 x *]] }
+Memory Sharing: Out of 40 matrices, 19 are shared as 8, and 21 are not shared.
+{ W0*features+B0 : [512 x 1 x *] (gradient)
+  W1*H1 : [512 x 1 x *] }
+{ W0 : [512 x 363] (gradient)
+  W0*features+B0 : [512 x 1 x *] }
+{ H1 : [512 x 1 x *]
+  W0*features : [512 x *] (gradient) }
+{ W1 : [512 x 512] (gradient)
+  W1*H1+B1 : [512 x 1 x *] }
+{ H2 : [512 x 1 x *]
+  W1*H1 : [512 x 1 x *] (gradient) }
+{ HLast : [132 x 1 x *]
+  W2 : [132 x 512] (gradient) }
+{ B0 : [512 x 1] (gradient)
+  H1 : [512 x 1 x *] (gradient)
+  W1*H1+B1 : [512 x 1 x *] (gradient)
+  W2*H1 : [132 x 1 x *] }
+{ B1 : [512 x 1] (gradient)
+  H2 : [512 x 1 x *] (gradient)
+  HLast : [132 x 1 x *] (gradient) }
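The old baseline above dumps raw buffer addresses; the new one summarizes the same allocation as "out of 40 matrices, 19 are shared as 8": value and gradient matrices whose lifetimes never overlap during forward/backward can reuse one buffer. A toy greedy version of the idea, not CNTK's actual allocator:

    def share_buffers(lifetimes):
        # lifetimes: name -> (first_use, last_use) in execution order.
        # Matrices whose intervals don't overlap may share one allocation.
        pools = []                                  # each: [last_end, [names]]
        for name, (start, end) in sorted(lifetimes.items(), key=lambda kv: kv[1]):
            for pool in pools:
                if pool[0] < start:                 # pool is free again
                    pool[0] = end
                    pool[1].append(name)
                    break
            else:
                pools.append([end, [name]])
        return [names for _, names in pools]

    # e.g. share_buffers({"H1.grad": (5, 6), "B0.grad": (7, 8)}) -> one pool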
-05/03/2016 13:22:23: Precomputing --> 3 PreCompute nodes found.
-05/03/2016 13:22:23: MeanOfFeatures = Mean()
-05/03/2016 13:22:23: InvStdOfFeatures = InvStdDev()
-05/03/2016 13:22:23: Prior = Mean()
+08/16/2016 03:20:12: Training 516740 parameters in 6 out of 6 parameter tensors and 15 nodes with gradient:
+08/16/2016 03:20:12: Node 'B0' (LearnableParameter operation) : [512 x 1]
+08/16/2016 03:20:12: Node 'B1' (LearnableParameter operation) : [512 x 1]
+08/16/2016 03:20:12: Node 'B2' (LearnableParameter operation) : [132 x 1]
+08/16/2016 03:20:12: Node 'W0' (LearnableParameter operation) : [512 x 363]
+08/16/2016 03:20:12: Node 'W1' (LearnableParameter operation) : [512 x 512]
+08/16/2016 03:20:12: Node 'W2' (LearnableParameter operation) : [132 x 512]
+08/16/2016 03:20:12: Precomputing --> 3 PreCompute nodes found.
+08/16/2016 03:20:12: MeanOfFeatures = Mean()
+08/16/2016 03:20:12: InvStdOfFeatures = InvStdDev()
+08/16/2016 03:20:12: Prior = Mean()
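The 516740 figure is just the sum of the six tensors listed: 512*363 + 512*512 + 132*512 weights plus 512 + 512 + 132 biases.

    n_params = 512*363 + 512*512 + 132*512 + 512 + 512 + 132
    assert n_params == 516740  # matches "Training 516740 parameters" above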
 minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
 requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
-05/03/2016 13:22:24: Precomputing --> Completed.
-05/03/2016 13:22:24: Starting Epoch 1: learning rate per sample = 0.003906 effective momentum = 0.900000 momentum as time constant = 2429.8 samples
+08/16/2016 03:20:15: Precomputing --> Completed.
+08/16/2016 03:20:15: Starting Epoch 1: learning rate per sample = 0.003906 effective momentum = 0.900000 momentum as time constant = 2429.8 samples
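The "momentum as time constant" figure in the line above follows from the per-minibatch momentum: with momentumPerMB = 0.9 and (by our reading of the config) a 256-sample minibatch, the per-sample momentum is 0.9^(1/256) and the time constant is 256 / (-ln 0.9) ≈ 2429.8 samples.

    import math

    mb_size = 256                   # assumed minibatch size from the config
    momentum_per_mb = 0.9
    momentum_per_sample = momentum_per_mb ** (1.0 / mb_size)
    time_constant = -1.0 / math.log(momentum_per_sample)
    print(round(time_constant, 1))  # -> 2429.8, matching the log line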
 minibatchiterator: epoch 0: frames [0..2048] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
-05/03/2016 13:22:24: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1), distributed reading is ENABLED.
-05/03/2016 13:22:25: Finished Epoch[ 1 of 1]: [Training] CrossEntropyWithSoftmax = 4.48531419 * 2048; EvalClassificationError = 0.90722656 * 2048; totalSamplesSeen = 2048; learningRatePerSample = 0.00390625; epochTime=0.288909s
+08/16/2016 03:20:15: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1), distributed reading is ENABLED.
+08/16/2016 03:20:15: Finished Epoch[ 1 of 1]: [Training] CrossEntropyWithSoftmax = 4.46427900 * 2048; EvalErrorPrediction = 0.91259766 * 2048; totalSamplesSeen = 2048; learningRatePerSample = 0.00390625; epochTime=0.28059s
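NumGradientBits = 1 in the minibatch-loop line refers to 1-bit SGD: each worker quantizes its gradient to one bit per element before exchange and carries the quantization error into the next minibatch. A hedged sketch of that quantize-with-error-feedback step (the published technique, not CNTK's actual kernel):

    import numpy as np

    def one_bit_quantize(grad, residual):
        # Add the carried error, send only signs scaled by the mean
        # magnitude, keep the new quantization error for the next minibatch.
        g = grad + residual
        scale = np.abs(g).mean() if g.size else 0.0
        q = np.where(g >= 0.0, scale, -scale)
        return q, g - q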
-05/03/2016 13:22:25: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn'
-05/03/2016 13:22:25: CNTKCommandTrainEnd: speechTrain
-05/03/2016 13:22:25: Action "train" complete.
-05/03/2016 13:22:25: __COMPLETED__
+08/16/2016 03:20:15: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_cpu/Models/cntkSpeechFF.dnn'
+08/16/2016 03:20:15: CNTKCommandTrainEnd: speechTrain
+08/16/2016 03:20:15: Action "train" complete.
+08/16/2016 03:20:15: __COMPLETED__
+~MPIWrapper
View file

@@ -1 +0,0 @@
-__COMPLETED__

View file

@@ -1 +0,0 @@
-__COMPLETED__
@@ -1,18 +1,24 @@
-=== Running /cygdrive/c/jenkins/workspace/CNTK-Test-Windows-W1/x64/release/cntk.exe configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/FeedForward.cntk currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu DeviceId=0 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=2048]]
+CPU info:
+CPU Model Name: Intel(R) Xeon(R) CPU E5-2630 v2 @ 2.60GHz
+Hardware threads: 24
+Total Memory: 268381192 kB
+-------------------------------------------------------------------
+=== Running /cygdrive/c/jenkins/workspace/CNTK-Test-Windows-W1/x64/release/cntk.exe configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/FeedForward.cntk currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu DeviceId=0 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=2048]] speechTrain=[reader=[useMersenneTwisterRand=true]]
 -------------------------------------------------------------------
 Build info:
-Built time: May 3 2016 13:15:46
-Last modified date: Tue Apr 26 23:35:31 2016
+Built time: Aug 16 2016 03:09:16
+Last modified date: Fri Aug 12 05:28:23 2016
 Build type: Release
 Build target: GPU
-With 1bit-SGD: no
+With 1bit-SGD: yes
+Math lib: mkl
 CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5
 CUB_PATH: c:\src\cub-1.4.1
 CUDNN_PATH: c:\NVIDIA\cudnn-4.0\cuda
 Build Branch: HEAD
-Build SHA1: af96f7cce6c3c78a4f1e9315e061291c79360e12
-Built by svcphil on cntk-muc01
+Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
+Built by svcphil on Philly-Pool1
 Build Path: c:\jenkins\workspace\CNTK-Build-Windows\Source\CNTK\
 -------------------------------------------------------------------
 Changed current directory to C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
@@ -25,31 +31,39 @@ ping [requestnodes (after change)]: all 1 nodes responded
 mpihelper: only one MPI process: MPI operation will be boring
 ping [mpihelper]: 1 nodes pinging each other
 ping [mpihelper]: all 1 nodes responded
-05/03/2016 13:22:25: -------------------------------------------------------------------
-05/03/2016 13:22:25: Build info:
-05/03/2016 13:22:25: Built time: May 3 2016 13:15:46
-05/03/2016 13:22:25: Last modified date: Tue Apr 26 23:35:31 2016
-05/03/2016 13:22:25: Build type: Release
-05/03/2016 13:22:25: Build target: GPU
-05/03/2016 13:22:25: With 1bit-SGD: no
-05/03/2016 13:22:25: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5
-05/03/2016 13:22:25: CUB_PATH: c:\src\cub-1.4.1
-05/03/2016 13:22:25: CUDNN_PATH: c:\NVIDIA\cudnn-4.0\cuda
-05/03/2016 13:22:25: Build Branch: HEAD
-05/03/2016 13:22:25: Build SHA1: af96f7cce6c3c78a4f1e9315e061291c79360e12
-05/03/2016 13:22:25: Built by svcphil on cntk-muc01
-05/03/2016 13:22:25: Build Path: c:\jenkins\workspace\CNTK-Build-Windows\Source\CNTK\
-05/03/2016 13:22:25: -------------------------------------------------------------------
-05/03/2016 13:22:25: Running on DPHAIM-22 at 2016/05/03 13:22:25
-05/03/2016 13:22:25: Command line:
-C:\jenkins\workspace\CNTK-Test-Windows-W1\x64\release\cntk.exe configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/FeedForward.cntk currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu DeviceId=0 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=2048]]
+08/16/2016 03:20:17: -------------------------------------------------------------------
+08/16/2016 03:20:17: Build info:
+08/16/2016 03:20:17: Built time: Aug 16 2016 03:09:16
+08/16/2016 03:20:17: Last modified date: Fri Aug 12 05:28:23 2016
+08/16/2016 03:20:17: Build type: Release
+08/16/2016 03:20:17: Build target: GPU
+08/16/2016 03:20:17: With 1bit-SGD: yes
+08/16/2016 03:20:17: Math lib: mkl
+08/16/2016 03:20:17: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5
+08/16/2016 03:20:17: CUB_PATH: c:\src\cub-1.4.1
+08/16/2016 03:20:17: CUDNN_PATH: c:\NVIDIA\cudnn-4.0\cuda
+08/16/2016 03:20:17: Build Branch: HEAD
+08/16/2016 03:20:17: Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
+08/16/2016 03:20:17: Built by svcphil on Philly-Pool1
+08/16/2016 03:20:17: Build Path: c:\jenkins\workspace\CNTK-Build-Windows\Source\CNTK\
+08/16/2016 03:20:17: -------------------------------------------------------------------
+08/16/2016 03:20:19: -------------------------------------------------------------------
+08/16/2016 03:20:19: GPU info:
+08/16/2016 03:20:19: Device[0]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
+08/16/2016 03:20:19: Device[1]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
+08/16/2016 03:20:19: Device[2]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
+08/16/2016 03:20:19: -------------------------------------------------------------------
+08/16/2016 03:20:19: Running on DPHAIM-25 at 2016/08/16 03:20:19
+08/16/2016 03:20:19: Command line:
+C:\jenkins\workspace\CNTK-Test-Windows-W1\x64\release\cntk.exe configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/FeedForward.cntk currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu DeviceId=0 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=2048]] speechTrain=[reader=[useMersenneTwisterRand=true]]
-05/03/2016 13:22:25: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
-05/03/2016 13:22:25: RootDir = ".."
+08/16/2016 03:20:19: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
+08/16/2016 03:20:19: RootDir = ".."
 ConfigDir = "$RootDir$/Config"
 DataDir = "$RootDir$/Data"
 OutputDir = "$RootDir$/Output"
@@ -65,7 +79,7 @@ speechTrain = [
 SimpleNetworkBuilder = [
 layerSizes = 363:512:512:132
 trainingCriterion = "CrossEntropyWithSoftmax"
-evalCriterion = "ClassificationError"
+evalCriterion = "ErrorPrediction"
 layerTypes = "Sigmoid"
 applyMeanVarNorm = true
 needPrior = true
@@ -111,35 +125,36 @@ speechTrain = [
 ]
 ]
 currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
-RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu
+RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu
 DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
 ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
-OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu
+OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu
 DeviceId=0
 timestamping=true
 speechTrain=[SGD=[maxEpochs=1]]
 speechTrain=[SGD=[epochSize=2048]]
+speechTrain=[reader=[useMersenneTwisterRand=true]]
-05/03/2016 13:22:25: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<
+08/16/2016 03:20:19: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<
-05/03/2016 13:22:25: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
-05/03/2016 13:22:25: RootDir = ".."
+08/16/2016 03:20:19: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+08/16/2016 03:20:19: RootDir = ".."
 ConfigDir = "../Config"
 DataDir = "../Data"
 OutputDir = "../Output"
-ModelDir = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu/Models"
+ModelDir = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu/Models"
 deviceId = -1
 command = speechTrain
 precision = "float"
 traceLevel = "1"
-modelPath = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn"
+modelPath = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn"
 parallelTrain = true
 speechTrain = [
 action = "train"
 SimpleNetworkBuilder = [
 layerSizes = 363:512:512:132
 trainingCriterion = "CrossEntropyWithSoftmax"
-evalCriterion = "ClassificationError"
+evalCriterion = "ErrorPrediction"
 layerTypes = "Sigmoid"
 applyMeanVarNorm = true
 needPrior = true
@@ -185,36 +200,37 @@ speechTrain = [
 ]
 ]
 currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
-RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu
+RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu
 DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
 ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
-OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu
+OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu
 DeviceId=0
 timestamping=true
 speechTrain=[SGD=[maxEpochs=1]]
 speechTrain=[SGD=[epochSize=2048]]
+speechTrain=[reader=[useMersenneTwisterRand=true]]
-05/03/2016 13:22:25: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+08/16/2016 03:20:19: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
-05/03/2016 13:22:25: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+08/16/2016 03:20:19: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 configparameters: FeedForward.cntk:command=speechTrain
 configparameters: FeedForward.cntk:ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
 configparameters: FeedForward.cntk:currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
 configparameters: FeedForward.cntk:DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
 configparameters: FeedForward.cntk:deviceId=0
-configparameters: FeedForward.cntk:ModelDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu/Models
-configparameters: FeedForward.cntk:modelPath=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn
-configparameters: FeedForward.cntk:OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu
+configparameters: FeedForward.cntk:ModelDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu/Models
+configparameters: FeedForward.cntk:modelPath=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn
+configparameters: FeedForward.cntk:OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu
 configparameters: FeedForward.cntk:parallelTrain=true
 configparameters: FeedForward.cntk:precision=float
 configparameters: FeedForward.cntk:RootDir=..
-configparameters: FeedForward.cntk:RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu
+configparameters: FeedForward.cntk:RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu
 configparameters: FeedForward.cntk:speechTrain=[
 action = "train"
 SimpleNetworkBuilder = [
 layerSizes = 363:512:512:132
 trainingCriterion = "CrossEntropyWithSoftmax"
-evalCriterion = "ClassificationError"
+evalCriterion = "ErrorPrediction"
 layerTypes = "Sigmoid"
 applyMeanVarNorm = true
 needPrior = true
@@ -258,24 +274,24 @@ configparameters: FeedForward.cntk:speechTrain=[
 labelType = "category"
 ]
 ]
-] [SGD=[maxEpochs=1]] [SGD=[epochSize=2048]]
+] [SGD=[maxEpochs=1]] [SGD=[epochSize=2048]] [reader=[useMersenneTwisterRand=true]]
 configparameters: FeedForward.cntk:timestamping=true
 configparameters: FeedForward.cntk:traceLevel=1
-05/03/2016 13:22:25: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
-05/03/2016 13:22:25: Commands: speechTrain
-05/03/2016 13:22:25: Precision = "float"
-05/03/2016 13:22:25: CNTKModelPath: C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn
-05/03/2016 13:22:25: CNTKCommandTrainInfo: speechTrain : 1
-05/03/2016 13:22:25: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 1
-05/03/2016 13:22:25: ##############################################################################
-05/03/2016 13:22:25: # #
-05/03/2016 13:22:25: # Action "train" #
-05/03/2016 13:22:25: # #
-05/03/2016 13:22:25: ##############################################################################
-05/03/2016 13:22:25: CNTKCommandTrainBegin: speechTrain
+08/16/2016 03:20:19: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+08/16/2016 03:20:19: Commands: speechTrain
+08/16/2016 03:20:19: Precision = "float"
+08/16/2016 03:20:19: CNTKModelPath: C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn
+08/16/2016 03:20:19: CNTKCommandTrainInfo: speechTrain : 1
+08/16/2016 03:20:19: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 1
+08/16/2016 03:20:19: ##############################################################################
+08/16/2016 03:20:19: # #
+08/16/2016 03:20:19: # Action "train" #
+08/16/2016 03:20:19: # #
+08/16/2016 03:20:19: ##############################################################################
+08/16/2016 03:20:19: CNTKCommandTrainBegin: speechTrain
 SimpleNetworkBuilder Using GPU 0
 reading script file C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.scp ... 948 entries
 total 132 state names in state list C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/state.list
@@ -284,14 +300,26 @@ htkmlfreader: reading MLF file C:\jenkins\workspace\CNTK-Test-Windows-W1\Example
 label set 0: 129 classes
 minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
-05/03/2016 13:22:25: Creating virgin network.
+08/16/2016 03:20:19: Creating virgin network.
+Node 'W0' (LearnableParameter operation): Initializing Parameter[512 x 363] <- 0.000000.
+Node 'W0' (LearnableParameter operation): Initializing Parameter[512 x 363] <- uniform(seed=1, range=0.050000*1.000000, onCPU=false).
 Microsoft::MSR::CNTK::GPUMatrix<ElemType>::SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4
+Node 'B0' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
+Node 'B0' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
+Node 'W1' (LearnableParameter operation): Initializing Parameter[512 x 512] <- 0.000000.
+Node 'W1' (LearnableParameter operation): Initializing Parameter[512 x 512] <- uniform(seed=2, range=0.050000*1.000000, onCPU=false).
+Node 'B1' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
+Node 'B1' (LearnableParameter operation): Initializing Parameter[512 x 1] <- 0.000000.
+Node 'W2' (LearnableParameter operation): Initializing Parameter[132 x 512] <- 0.000000.
+Node 'W2' (LearnableParameter operation): Initializing Parameter[132 x 512] <- uniform(seed=3, range=0.050000*1.000000, onCPU=false).
+Node 'B2' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
+Node 'B2' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
 Post-processing network...
 7 roots:
 CrossEntropyWithSoftmax = CrossEntropyWithSoftmax()
-EvalClassificationError = ClassificationError()
+EvalErrorPrediction = ErrorPrediction()
 InvStdOfFeatures = InvStdDev()
 MeanOfFeatures = Mean()
 PosteriorProb = Softmax()
@@ -320,7 +348,7 @@ Validating --> W2*H1 = Times (W2, H2) : [132 x 512], [512 x 1 x *] -> [132 x 1 x
 Validating --> B2 = LearnableParameter() : -> [132 x 1]
 Validating --> HLast = Plus (W2*H1, B2) : [132 x 1 x *], [132 x 1] -> [132 x 1 x *]
 Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [132 x *], [132 x 1 x *] -> [1]
-Validating --> EvalClassificationError = ClassificationError (labels, HLast) : [132 x *], [132 x 1 x *] -> [1]
+Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [132 x *], [132 x 1 x *] -> [1]
 Validating --> PosteriorProb = Softmax (HLast) : [132 x 1 x *] -> [132 x 1 x *]
 Validating --> Prior = Mean (labels) : [132 x *] -> [132]
 Validating --> LogOfPrior = Log (Prior) : [132] -> [132]
@@ -337,70 +365,70 @@ Validating network, final pass.
 Post-processing network complete.
-05/03/2016 13:22:26: Created model with 25 nodes on GPU 0.
-05/03/2016 13:22:26: Training criterion node(s):
-05/03/2016 13:22:26: CrossEntropyWithSoftmax = CrossEntropyWithSoftmax
-05/03/2016 13:22:26: Evaluation criterion node(s):
-05/03/2016 13:22:26: EvalClassificationError = ClassificationError
+08/16/2016 03:20:20: Created model with 25 nodes on GPU 0.
+08/16/2016 03:20:20: Training criterion node(s):
+08/16/2016 03:20:20: CrossEntropyWithSoftmax = CrossEntropyWithSoftmax
+08/16/2016 03:20:20: Evaluation criterion node(s):
+08/16/2016 03:20:20: EvalErrorPrediction = ErrorPrediction
 Allocating matrices for forward and/or backward propagation.
-Memory Sharing Structure:
-0000000000000000: {[EvalClassificationError Gradient[1]] [InvStdOfFeatures Gradient[363]] [LogOfPrior Gradient[132]] [MVNormalizedFeatures Gradient[363 x *]] [MeanOfFeatures Gradient[363]] [PosteriorProb Gradient[132 x 1 x *]] [PosteriorProb Value[132 x 1 x *]] [Prior Gradient[132]] [ScaledLogLikelihood Gradient[132 x 1 x *]] [features Gradient[363 x *]] [labels Gradient[132 x *]] }
-00000087D360C610: {[features Value[363 x *]] }
-00000087EB4FEEF0: {[W0 Value[512 x 363]] }
-00000087EB4FF530: {[B1 Value[512 x 1]] }
-00000087EB4FF850: {[W1 Value[512 x 512]] }
-00000087EB4FFC10: {[W2 Value[132 x 512]] }
-00000087EB500070: {[B2 Value[132 x 1]] }
-00000087EB5001B0: {[MeanOfFeatures Value[363]] }
-00000087EB500250: {[InvStdOfFeatures Value[363]] }
-00000087EB5004D0: {[B0 Value[512 x 1]] }
-00000087EDA2B150: {[labels Value[132 x *]] }
-00000087EDA2B330: {[B1 Gradient[512 x 1]] [H2 Gradient[512 x 1 x *]] [HLast Gradient[132 x 1 x *]] }
-00000087EDA2B3D0: {[Prior Value[132]] }
-00000087EDA2B6F0: {[HLast Value[132 x 1 x *]] [W2 Gradient[132 x 512]] }
-00000087EDA2B8D0: {[W0 Gradient[512 x 363]] [W0*features+B0 Value[512 x 1 x *]] }
-00000087EDA2BB50: {[CrossEntropyWithSoftmax Value[1]] }
-00000087EDA2BC90: {[W0*features+B0 Gradient[512 x 1 x *]] [W1*H1 Value[512 x 1 x *]] }
-00000087EDA2C0F0: {[EvalClassificationError Value[1]] }
-00000087EDA2C190: {[W0*features Value[512 x *]] }
-00000087EDA2C2D0: {[H1 Value[512 x 1 x *]] [W0*features Gradient[512 x *]] }
-00000087EDA2C370: {[W2*H1 Gradient[132 x 1 x *]] }
-00000087EDA2C410: {[B2 Gradient[132 x 1]] }
-00000087EDA2C730: {[ScaledLogLikelihood Value[132 x 1 x *]] }
-00000087EDA2C7D0: {[LogOfPrior Value[132]] }
-00000087EDA2CAF0: {[MVNormalizedFeatures Value[363 x *]] }
-00000087EDA2CB90: {[H2 Value[512 x 1 x *]] [W1*H1 Gradient[512 x 1 x *]] }
-00000087EDA2CCD0: {[B0 Gradient[512 x 1]] [H1 Gradient[512 x 1 x *]] [W1*H1+B1 Gradient[512 x 1 x *]] [W2*H1 Value[132 x 1 x *]] }
-00000087EDA2CEB0: {[CrossEntropyWithSoftmax Gradient[1]] }
-00000087EDA2CFF0: {[W1 Gradient[512 x 512]] [W1*H1+B1 Value[512 x 1 x *]] }
+Memory Sharing: Out of 40 matrices, 19 are shared as 8, and 21 are not shared.
+{ W0*features+B0 : [512 x 1 x *] (gradient)
+  W1*H1 : [512 x 1 x *] }
+{ H2 : [512 x 1 x *]
+  W1*H1 : [512 x 1 x *] (gradient) }
+{ HLast : [132 x 1 x *]
+  W2 : [132 x 512] (gradient) }
+{ W0 : [512 x 363] (gradient)
+  W0*features+B0 : [512 x 1 x *] }
+{ B0 : [512 x 1] (gradient)
+  H1 : [512 x 1 x *] (gradient)
+  W1*H1+B1 : [512 x 1 x *] (gradient)
+  W2*H1 : [132 x 1 x *] }
+{ H1 : [512 x 1 x *]
+  W0*features : [512 x *] (gradient) }
+{ W1 : [512 x 512] (gradient)
+  W1*H1+B1 : [512 x 1 x *] }
+{ B1 : [512 x 1] (gradient)
+  H2 : [512 x 1 x *] (gradient)
+  HLast : [132 x 1 x *] (gradient) }
-05/03/2016 13:22:26: Precomputing --> 3 PreCompute nodes found.
-05/03/2016 13:22:26: MeanOfFeatures = Mean()
-05/03/2016 13:22:26: InvStdOfFeatures = InvStdDev()
-05/03/2016 13:22:26: Prior = Mean()
+08/16/2016 03:20:20: Training 516740 parameters in 6 out of 6 parameter tensors and 15 nodes with gradient:
+08/16/2016 03:20:20: Node 'B0' (LearnableParameter operation) : [512 x 1]
+08/16/2016 03:20:20: Node 'B1' (LearnableParameter operation) : [512 x 1]
+08/16/2016 03:20:20: Node 'B2' (LearnableParameter operation) : [132 x 1]
+08/16/2016 03:20:20: Node 'W0' (LearnableParameter operation) : [512 x 363]
+08/16/2016 03:20:20: Node 'W1' (LearnableParameter operation) : [512 x 512]
+08/16/2016 03:20:20: Node 'W2' (LearnableParameter operation) : [132 x 512]
+08/16/2016 03:20:20: Precomputing --> 3 PreCompute nodes found.
+08/16/2016 03:20:20: MeanOfFeatures = Mean()
+08/16/2016 03:20:20: InvStdOfFeatures = InvStdDev()
+08/16/2016 03:20:20: Prior = Mean()
 minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
 requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
-05/03/2016 13:22:27: Precomputing --> Completed.
-05/03/2016 13:22:27: Starting Epoch 1: learning rate per sample = 0.003906 effective momentum = 0.900000 momentum as time constant = 2429.8 samples
+08/16/2016 03:20:21: Precomputing --> Completed.
+08/16/2016 03:20:21: Starting Epoch 1: learning rate per sample = 0.003906 effective momentum = 0.900000 momentum as time constant = 2429.8 samples
 minibatchiterator: epoch 0: frames [0..2048] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
-05/03/2016 13:22:27: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1), distributed reading is ENABLED.
-05/03/2016 13:22:27: Finished Epoch[ 1 of 1]: [Training] CrossEntropyWithSoftmax = 4.42832291 * 2048; EvalClassificationError = 0.91357422 * 2048; totalSamplesSeen = 2048; learningRatePerSample = 0.00390625; epochTime=0.052947s
-05/03/2016 13:22:27: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160503132211.330996\Examples\Speech\AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn'
-05/03/2016 13:22:27: CNTKCommandTrainEnd: speechTrain
-05/03/2016 13:22:27: Action "train" complete.
-05/03/2016 13:22:27: __COMPLETED__
+08/16/2016 03:20:21: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 1, NumGradientBits = 1), distributed reading is ENABLED.
+08/16/2016 03:20:21: Finished Epoch[ 1 of 1]: [Training] CrossEntropyWithSoftmax = 4.41144794 * 2048; EvalErrorPrediction = 0.92773438 * 2048; totalSamplesSeen = 2048; learningRatePerSample = 0.00390625; epochTime=0.05551s
+08/16/2016 03:20:21: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_FeedForward@release_gpu/Models/cntkSpeechFF.dnn'
+08/16/2016 03:20:21: CNTKCommandTrainEnd: speechTrain
+08/16/2016 03:20:21: Action "train" complete.
+08/16/2016 03:20:21: __COMPLETED__
+~MPIWrapper
View file

@@ -5,5 +5,5 @@
 ConfigDir=$TEST_DIR/../../../../../../Examples/Speech/AN4/Config
 # cntkrun <CNTK config file name> <additional CNTK args>
-cntkrun FeedForward.cntk "speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=2048]]" || exit $?
+cntkrun FeedForward.cntk "speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=2048]] speechTrain=[reader=[useMersenneTwisterRand=true]]" || exit $?
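The only change to this test driver is the added useMersenneTwisterRand=true, which pins the reader to a platform-independent PRNG so the utterance shuffle, and hence the baseline numbers, agree across Windows and Linux builds; C's rand() gives no such guarantee. An illustration of the idea (Python's Random is MT19937):

    import random

    def shuffled_indices(n, seed):
        rng = random.Random(seed)  # Mersenne Twister: same sequence everywhere
        idx = list(range(n))
        rng.shuffle(idx)
        return idx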
View file

@@ -0,0 +1,682 @@
CPU info:
CPU Model Name: Intel(R) Xeon(R) CPU E5-2630 v2 @ 2.60GHz
Hardware threads: 24
Total Memory: 264172964 kB
-------------------------------------------------------------------
=== Running /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/1bitsgd/release/bin/cntk configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config/LSTM-NDL.cntk currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu DeviceId=-1 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=64]] speechTrain=[reader=[useMersenneTwisterRand=true]] parallelTrain=false
-------------------------------------------------------------------
Build info:
Built time: Aug 16 2016 09:41:57
Last modified date: Mon Aug 15 23:39:17 2016
Build type: release
Build target: GPU
With 1bit-SGD: yes
Math lib: mkl
CUDA_PATH: /usr/local/cuda-7.5
CUB_PATH: /usr/local/cub-1.4.1
CUDNN_PATH: /usr/local/cudnn-4.0
Build Branch: HEAD
Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
Built by philly on 643085f7f8c2
Build Path: /home/philly/jenkins/workspace/CNTK-Build-Linux
-------------------------------------------------------------------
Changed current directory to /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
08/16/2016 10:01:47: -------------------------------------------------------------------
08/16/2016 10:01:47: Build info:
08/16/2016 10:01:47: Built time: Aug 16 2016 09:41:57
08/16/2016 10:01:47: Last modified date: Mon Aug 15 23:39:17 2016
08/16/2016 10:01:47: Build type: release
08/16/2016 10:01:47: Build target: GPU
08/16/2016 10:01:47: With 1bit-SGD: yes
08/16/2016 10:01:47: Math lib: mkl
08/16/2016 10:01:47: CUDA_PATH: /usr/local/cuda-7.5
08/16/2016 10:01:47: CUB_PATH: /usr/local/cub-1.4.1
08/16/2016 10:01:47: CUDNN_PATH: /usr/local/cudnn-4.0
08/16/2016 10:01:47: Build Branch: HEAD
08/16/2016 10:01:47: Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
08/16/2016 10:01:47: Built by philly on 643085f7f8c2
08/16/2016 10:01:47: Build Path: /home/philly/jenkins/workspace/CNTK-Build-Linux
08/16/2016 10:01:47: -------------------------------------------------------------------
08/16/2016 10:01:47: -------------------------------------------------------------------
08/16/2016 10:01:47: GPU info:
08/16/2016 10:01:47: Device[0]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
08/16/2016 10:01:47: Device[1]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
08/16/2016 10:01:47: Device[2]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
08/16/2016 10:01:47: Device[3]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
08/16/2016 10:01:47: -------------------------------------------------------------------
08/16/2016 10:01:47: Running on localhost at 2016/08/16 10:01:47
08/16/2016 10:01:47: Command line:
/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/1bitsgd/release/bin/cntk configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config/LSTM-NDL.cntk currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu DeviceId=-1 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=64]] speechTrain=[reader=[useMersenneTwisterRand=true]] parallelTrain=false
08/16/2016 10:01:47: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
08/16/2016 10:01:47: RootDir = ".."
ConfigDir = "$RootDir$/Config"
DataDir = "$RootDir$/Data"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"
deviceId = -1
command = speechTrain
precision = "float"
traceLevel = 1
modelPath = "$ModelDir$/cntkSpeechLSTM.dnn"
parallelTrain = true
frameMode = false
truncated = true
speechTrain = [
action = "train"
nbrUttsIneachRecurrentIter = 16
NDLNetworkBuilder = [
networkDescription = "$ConfigDir$/lstmp-3layer-opt.ndl"
]
SGD = [
epochSize = 0
minibatchSize = 16
learningRatesPerMB = 0.5
numMBsToShowResult = 10
momentumPerMB = 0:0.9
maxEpochs = 4
keepCheckPointFiles = true
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "$DataDir$/glob_0000.scp"
]
labels = [
mlfFile = "$DataDir$/glob_0000.mlf"
labelMappingFile = "$DataDir$/state.list"
labelDim = 132
labelType = "category"
]
]
]
currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu
DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config
OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu
DeviceId=-1
timestamping=true
speechTrain=[SGD=[maxEpochs=1]]
speechTrain=[SGD=[epochSize=64]]
speechTrain=[reader=[useMersenneTwisterRand=true]]
parallelTrain=false
08/16/2016 10:01:47: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<
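Values such as momentumPerMB = 0:0.9 in the config above are per-epoch schedules: by our reading of the syntax, epoch 1 runs with momentum 0 and every later epoch with 0.9, the last entry repeating. A sketch of a parser for that colon-separated form, including the 'value*count' repeat notation (an assumption, not CNTK's parser):

    def parse_schedule(spec):
        # "0:0.9" -> [0.0, 0.9]; "0.5*2:0.1" -> [0.5, 0.5, 0.1];
        # the last value holds for all remaining epochs.
        values = []
        for part in spec.split(":"):
            if "*" in part:
                v, n = part.split("*")
                values += [float(v)] * int(n)
            else:
                values.append(float(part))
        return lambda epoch: values[min(epoch, len(values) - 1)]  # 0-based

    momentum = parse_schedule("0:0.9")
    assert momentum(0) == 0.0 and momentum(3) == 0.9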
08/16/2016 10:01:47: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
08/16/2016 10:01:47: RootDir = ".."
ConfigDir = "../Config"
DataDir = "../Data"
OutputDir = "../Output"
ModelDir = "/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu/Models"
deviceId = -1
command = speechTrain
precision = "float"
traceLevel = 1
modelPath = "/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu/Models/cntkSpeechLSTM.dnn"
parallelTrain = true
frameMode = false
truncated = true
speechTrain = [
action = "train"
nbrUttsIneachRecurrentIter = 16
NDLNetworkBuilder = [
networkDescription = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config/lstmp-3layer-opt.ndl"
]
SGD = [
epochSize = 0
minibatchSize = 16
learningRatesPerMB = 0.5
numMBsToShowResult = 10
momentumPerMB = 0:0.9
maxEpochs = 4
keepCheckPointFiles = true
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp"
]
labels = [
mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf"
labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list"
labelDim = 132
labelType = "category"
]
]
]
currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu
DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config
OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu
DeviceId=-1
timestamping=true
speechTrain=[SGD=[maxEpochs=1]]
speechTrain=[SGD=[epochSize=64]]
speechTrain=[reader=[useMersenneTwisterRand=true]]
parallelTrain=false
08/16/2016 10:01:47: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
08/16/2016 10:01:47: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
configparameters: LSTM-NDL.cntk:command=speechTrain
configparameters: LSTM-NDL.cntk:ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config
configparameters: LSTM-NDL.cntk:currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
configparameters: LSTM-NDL.cntk:DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
configparameters: LSTM-NDL.cntk:deviceId=-1
configparameters: LSTM-NDL.cntk:frameMode=false
configparameters: LSTM-NDL.cntk:ModelDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu/Models
configparameters: LSTM-NDL.cntk:modelPath=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu/Models/cntkSpeechLSTM.dnn
configparameters: LSTM-NDL.cntk:OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu
configparameters: LSTM-NDL.cntk:parallelTrain=false
configparameters: LSTM-NDL.cntk:precision=float
configparameters: LSTM-NDL.cntk:RootDir=..
configparameters: LSTM-NDL.cntk:RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu
configparameters: LSTM-NDL.cntk:speechTrain=[
action = "train"
nbrUttsIneachRecurrentIter = 16
NDLNetworkBuilder = [
networkDescription = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config/lstmp-3layer-opt.ndl"
]
SGD = [
epochSize = 0
minibatchSize = 16
learningRatesPerMB = 0.5
numMBsToShowResult = 10
momentumPerMB = 0:0.9
maxEpochs = 4
keepCheckPointFiles = true
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp"
]
labels = [
mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf"
labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list"
labelDim = 132
labelType = "category"
]
]
] [SGD=[maxEpochs=1]] [SGD=[epochSize=64]] [reader=[useMersenneTwisterRand=true]]
configparameters: LSTM-NDL.cntk:timestamping=true
configparameters: LSTM-NDL.cntk:traceLevel=1
configparameters: LSTM-NDL.cntk:truncated=true
08/16/2016 10:01:47: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
08/16/2016 10:01:47: Commands: speechTrain
08/16/2016 10:01:47: Precision = "float"
08/16/2016 10:01:47: CNTKModelPath: /tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu/Models/cntkSpeechLSTM.dnn
08/16/2016 10:01:47: CNTKCommandTrainInfo: speechTrain : 1
08/16/2016 10:01:47: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 1
08/16/2016 10:01:47: ##############################################################################
08/16/2016 10:01:47: # #
08/16/2016 10:01:47: # Action "train" #
08/16/2016 10:01:47: # #
08/16/2016 10:01:47: ##############################################################################
08/16/2016 10:01:47: CNTKCommandTrainBegin: speechTrain
NDLBuilder Using CPU
reading script file /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp ... 948 entries
total 132 state names in state list /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list
htkmlfreader: reading MLF file /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf ... total 948 entries
...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
label set 0: 129 classes
minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
useParallelTrain option is not enabled. ParallelTrain config will be ignored.
08/16/2016 10:01:48: Creating virgin network.
Node 'LSTMoutput1.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput1.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput1.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput1.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput1.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
Node 'LSTMoutput2.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput2.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput2.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput2.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput2.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
Node 'LSTMoutput3.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput3.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput3.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput3.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput3.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
Node 'b' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
Node 'LSTMoutput1.wx' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput1.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput1.Wh' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput1.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=3, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput1.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=4, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput1.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=5, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput1.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=6, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput2.wx' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput2.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput2.Wh' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput2.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=9, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput2.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=10, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput2.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=11, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput2.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=12, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput3.wx' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput3.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput3.Wh' (LearnableParameter operation): Initializating Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput3.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=15, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput3.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=16, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput3.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=17, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput3.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=18, range=0.050000*1.000000, onCPU=false).
Node 'W' (LearnableParameter operation): Initializating Parameter[132 x 0] as uniform later when dimensions are fully known.
Node 'b' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
Post-processing network...
6 roots:
ce = CrossEntropyWithSoftmax()
err = ErrorPrediction()
featNorm.xMean = Mean()
featNorm.xStdDev = InvStdDev()
logPrior.prior = Mean()
scaledLogLikelihood = Minus()
Loop[0] --> Loop_LSTMoutput1.output -> 24 nodes
LSTMoutput1.dh LSTMoutput1.whh LSTMoutput1.wxxpbpwhh
LSTMoutput1.G4 LSTMoutput1.G3 LSTMoutput1.dc
LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft
LSTMoutput1.bft LSTMoutput1.G1 LSTMoutput1.Wcidc
LSTMoutput1.unnamed163 LSTMoutput1.it LSTMoutput1.G2
LSTMoutput1.unnamed164 LSTMoutput1.bit LSTMoutput1.ct
LSTMoutput1.Wcoct LSTMoutput1.unnamed166 LSTMoutput1.ot
LSTMoutput1.unnamed167 LSTMoutput1.mt LSTMoutput1.output
Loop[1] --> Loop_LSTMoutput2.output -> 24 nodes
LSTMoutput2.dh LSTMoutput2.whh LSTMoutput2.wxxpbpwhh
LSTMoutput2.G4 LSTMoutput2.G3 LSTMoutput2.dc
LSTMoutput2.Wcfdc LSTMoutput2.unnamed175 LSTMoutput2.ft
LSTMoutput2.bft LSTMoutput2.G1 LSTMoutput2.Wcidc
LSTMoutput2.unnamed173 LSTMoutput2.it LSTMoutput2.G2
LSTMoutput2.unnamed174 LSTMoutput2.bit LSTMoutput2.ct
LSTMoutput2.Wcoct LSTMoutput2.unnamed176 LSTMoutput2.ot
LSTMoutput2.unnamed177 LSTMoutput2.mt LSTMoutput2.output
Loop[2] --> Loop_LSTMoutput3.output -> 24 nodes
LSTMoutput3.dh LSTMoutput3.whh LSTMoutput3.wxxpbpwhh
LSTMoutput3.G4 LSTMoutput3.G3 LSTMoutput3.dc
LSTMoutput3.Wcfdc LSTMoutput3.unnamed185 LSTMoutput3.ft
LSTMoutput3.bft LSTMoutput3.G1 LSTMoutput3.Wcidc
LSTMoutput3.unnamed183 LSTMoutput3.it LSTMoutput3.G2
LSTMoutput3.unnamed184 LSTMoutput3.bit LSTMoutput3.ct
LSTMoutput3.Wcoct LSTMoutput3.unnamed186 LSTMoutput3.ot
LSTMoutput3.unnamed187 LSTMoutput3.mt LSTMoutput3.output
Validating network. 113 nodes to process in pass 1.
Validating --> labels = InputValue() : -> [132 x *]
Validating --> W = LearnableParameter() : -> [132 x 0]
Validating --> LSTMoutput3.Wmr = LearnableParameter() : -> [512 x 1024]
Validating --> LSTMoutput3.wx = LearnableParameter() : -> [4096 x 0]
Validating --> LSTMoutput2.Wmr = LearnableParameter() : -> [512 x 1024]
Validating --> LSTMoutput2.wx = LearnableParameter() : -> [4096 x 0]
Validating --> LSTMoutput1.Wmr = LearnableParameter() : -> [512 x 1024]
Validating --> LSTMoutput1.wx = LearnableParameter() : -> [4096 x 0]
Validating --> features = InputValue() : -> [363 x *]
Validating --> featNorm.xMean = Mean (features) : [363 x *] -> [363]
Validating --> featNorm.xStdDev = InvStdDev (features) : [363 x *] -> [363]
Validating --> featNorm.xNorm = PerDimMeanVarNormalization (features, featNorm.xMean, featNorm.xStdDev) : [363 x *], [363], [363] -> [363 x *]
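PerDimMeanVarNormalization standardizes each of the 363 feature dimensions using the precomputed Mean and InvStdDev nodes. Equivalent NumPy (our sketch):

    import numpy as np

    def per_dim_mean_var_norm(x, mean, inv_std):
        # x: [dim x frames]; mean, inv_std: [dim] from the precompute pass.
        return (x - mean[:, None]) * inv_std[:, None]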
Node 'LSTMoutput1.wx' (LearnableParameter operation): Tensor shape was inferred as [4096 x 363].
Node 'LSTMoutput1.wx' (LearnableParameter operation): Initializing Parameter[4096 x 363] <- uniform(seed=1, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput1.wxx = Times (LSTMoutput1.wx, featNorm.xNorm) : [4096 x 363], [363 x *] -> [4096 x *]
Validating --> LSTMoutput1.b = LearnableParameter() : -> [4096 x 1]
Validating --> LSTMoutput1.wxxpb = Plus (LSTMoutput1.wxx, LSTMoutput1.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
Validating --> LSTMoutput1.Wh = LearnableParameter() : -> [4096 x 0]
Validating --> LSTMoutput1.Wco = LearnableParameter() : -> [1024]
Validating --> LSTMoutput1.Wcf = LearnableParameter() : -> [1024]
Validating --> LSTMoutput1.Wci = LearnableParameter() : -> [1024]
Node 'LSTMoutput1.Wh' (LearnableParameter operation): Tensor shape was inferred as [4096 x 512].
Node 'LSTMoutput1.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=2, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput1.whh = Times (LSTMoutput1.Wh, LSTMoutput1.dh) : [4096 x 512], [512] -> [4096]
Validating --> LSTMoutput1.wxxpbpwhh = Plus (LSTMoutput1.wxxpb, LSTMoutput1.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
Validating --> LSTMoutput1.G4 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.G3 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.Wcfdc = DiagTimes (LSTMoutput1.Wcf, LSTMoutput1.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput1.unnamed165 = Plus (LSTMoutput1.G3, LSTMoutput1.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput1.ft = Sigmoid (LSTMoutput1.unnamed165) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.bft = ElementTimes (LSTMoutput1.ft, LSTMoutput1.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput1.G1 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.Wcidc = DiagTimes (LSTMoutput1.Wci, LSTMoutput1.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput1.unnamed163 = Plus (LSTMoutput1.G1, LSTMoutput1.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput1.it = Sigmoid (LSTMoutput1.unnamed163) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.G2 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.unnamed164 = Tanh (LSTMoutput1.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.bit = ElementTimes (LSTMoutput1.it, LSTMoutput1.unnamed164) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.ct = Plus (LSTMoutput1.bft, LSTMoutput1.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.Wcoct = DiagTimes (LSTMoutput1.Wco, LSTMoutput1.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.unnamed166 = Plus (LSTMoutput1.G4, LSTMoutput1.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.ot = Sigmoid (LSTMoutput1.unnamed166) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.unnamed167 = Tanh (LSTMoutput1.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.mt = ElementTimes (LSTMoutput1.ot, LSTMoutput1.unnamed167) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.output = Times (LSTMoutput1.Wmr, LSTMoutput1.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
Node 'LSTMoutput2.wx' (LearnableParameter operation): Tensor shape was inferred as [4096 x 512 x 1].
Node 'LSTMoutput2.wx' (LearnableParameter operation): Initializing Parameter[4096 x 512 x 1] <- uniform(seed=7, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput2.wxx = Times (LSTMoutput2.wx, LSTMoutput1.output) : [4096 x 512 x 1], [512 x 1 x *] -> [4096 x *]
Validating --> LSTMoutput2.b = LearnableParameter() : -> [4096 x 1]
Validating --> LSTMoutput2.wxxpb = Plus (LSTMoutput2.wxx, LSTMoutput2.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
Validating --> LSTMoutput2.Wh = LearnableParameter() : -> [4096 x 0]
Validating --> LSTMoutput2.Wco = LearnableParameter() : -> [1024]
Validating --> LSTMoutput2.Wcf = LearnableParameter() : -> [1024]
Validating --> LSTMoutput2.Wci = LearnableParameter() : -> [1024]
Node 'LSTMoutput2.Wh' (LearnableParameter operation): Tensor shape was inferred as [4096 x 512].
Node 'LSTMoutput2.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=8, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput2.whh = Times (LSTMoutput2.Wh, LSTMoutput2.dh) : [4096 x 512], [512] -> [4096]
Validating --> LSTMoutput2.wxxpbpwhh = Plus (LSTMoutput2.wxxpb, LSTMoutput2.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
Validating --> LSTMoutput2.G4 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.G3 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.Wcfdc = DiagTimes (LSTMoutput2.Wcf, LSTMoutput2.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput2.unnamed175 = Plus (LSTMoutput2.G3, LSTMoutput2.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput2.ft = Sigmoid (LSTMoutput2.unnamed175) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.bft = ElementTimes (LSTMoutput2.ft, LSTMoutput2.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput2.G1 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.Wcidc = DiagTimes (LSTMoutput2.Wci, LSTMoutput2.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput2.unnamed173 = Plus (LSTMoutput2.G1, LSTMoutput2.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput2.it = Sigmoid (LSTMoutput2.unnamed173) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.G2 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.unnamed174 = Tanh (LSTMoutput2.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.bit = ElementTimes (LSTMoutput2.it, LSTMoutput2.unnamed174) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.ct = Plus (LSTMoutput2.bft, LSTMoutput2.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.Wcoct = DiagTimes (LSTMoutput2.Wco, LSTMoutput2.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.unnamed176 = Plus (LSTMoutput2.G4, LSTMoutput2.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.ot = Sigmoid (LSTMoutput2.unnamed176) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.unnamed177 = Tanh (LSTMoutput2.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.mt = ElementTimes (LSTMoutput2.ot, LSTMoutput2.unnamed177) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.output = Times (LSTMoutput2.Wmr, LSTMoutput2.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
Node 'LSTMoutput3.wx' (LearnableParameter operation): Tensor shape was inferred as [4096 x 512 x 1].
Node 'LSTMoutput3.wx' (LearnableParameter operation): Initializing Parameter[4096 x 512 x 1] <- uniform(seed=13, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput3.wxx = Times (LSTMoutput3.wx, LSTMoutput2.output) : [4096 x 512 x 1], [512 x 1 x *] -> [4096 x *]
Validating --> LSTMoutput3.b = LearnableParameter() : -> [4096 x 1]
Validating --> LSTMoutput3.wxxpb = Plus (LSTMoutput3.wxx, LSTMoutput3.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
Validating --> LSTMoutput3.Wh = LearnableParameter() : -> [4096 x 0]
Validating --> LSTMoutput3.Wco = LearnableParameter() : -> [1024]
Validating --> LSTMoutput3.Wcf = LearnableParameter() : -> [1024]
Validating --> LSTMoutput3.Wci = LearnableParameter() : -> [1024]
Node 'LSTMoutput3.Wh' (LearnableParameter operation): Tensor shape was inferred as [4096 x 512].
Node 'LSTMoutput3.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=14, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput3.whh = Times (LSTMoutput3.Wh, LSTMoutput3.dh) : [4096 x 512], [512] -> [4096]
Validating --> LSTMoutput3.wxxpbpwhh = Plus (LSTMoutput3.wxxpb, LSTMoutput3.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
Validating --> LSTMoutput3.G4 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.G3 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.Wcfdc = DiagTimes (LSTMoutput3.Wcf, LSTMoutput3.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput3.unnamed185 = Plus (LSTMoutput3.G3, LSTMoutput3.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput3.ft = Sigmoid (LSTMoutput3.unnamed185) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.bft = ElementTimes (LSTMoutput3.ft, LSTMoutput3.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput3.G1 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.Wcidc = DiagTimes (LSTMoutput3.Wci, LSTMoutput3.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput3.unnamed183 = Plus (LSTMoutput3.G1, LSTMoutput3.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput3.it = Sigmoid (LSTMoutput3.unnamed183) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.G2 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.unnamed184 = Tanh (LSTMoutput3.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.bit = ElementTimes (LSTMoutput3.it, LSTMoutput3.unnamed184) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.ct = Plus (LSTMoutput3.bft, LSTMoutput3.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.Wcoct = DiagTimes (LSTMoutput3.Wco, LSTMoutput3.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.unnamed186 = Plus (LSTMoutput3.G4, LSTMoutput3.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.ot = Sigmoid (LSTMoutput3.unnamed186) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.unnamed187 = Tanh (LSTMoutput3.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.mt = ElementTimes (LSTMoutput3.ot, LSTMoutput3.unnamed187) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.output = Times (LSTMoutput3.Wmr, LSTMoutput3.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
Node 'W' (LearnableParameter operation): Tensor shape was inferred as [132 x 512 x 1].
Node 'W' (LearnableParameter operation): Initializing Parameter[132 x 512 x 1] <- uniform(seed=19, range=0.050000*1.000000, onCPU=false).
Validating --> unnamed193 = Times (W, LSTMoutput3.output) : [132 x 512 x 1], [512 x 1 x *] -> [132 x *]
Validating --> b = LearnableParameter() : -> [132 x 1]
Validating --> LSTMoutputW = Plus (unnamed193, b) : [132 x *], [132 x 1] -> [132 x 1 x *]
Validating --> ce = CrossEntropyWithSoftmax (labels, LSTMoutputW) : [132 x *], [132 x 1 x *] -> [1]
Validating --> err = ErrorPrediction (labels, LSTMoutputW) : [132 x *], [132 x 1 x *] -> [1]
Validating --> logPrior.prior = Mean (labels) : [132 x *] -> [132]
Validating --> logPrior.logPrior = Log (logPrior.prior) : [132] -> [132]
Validating --> scaledLogLikelihood = Minus (LSTMoutputW, logPrior.logPrior) : [132 x 1 x *], [132] -> [132 x 1 x *]
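Several parameters above are declared as [4096 x 0] placeholders ("as uniform later when dimensions are fully known") and only receive their input dimension during this validation pass, e.g. LSTMoutput1.wx inferred as [4096 x 363] from featNorm.xNorm. A hedged sketch of that deferred-inference pattern for a Times node (infer_times_param is a hypothetical helper, not CNTK code):

def infer_times_param(w_shape, x_shape):
    # A parameter declared as [rows x 0] takes its column count from the
    # operand's leading dimension during validation, then gets initialized.
    rows, cols = w_shape
    if cols == 0:                      # dimension left open at declaration
        cols = x_shape[0]              # fill in from the input, e.g. 363
    assert cols == x_shape[0], "Times: inner dimensions must agree"
    return (rows, cols)

# e.g. infer_times_param((4096, 0), (363,)) -> (4096, 363)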
Validating network. 88 nodes to process in pass 2.
Validating --> LSTMoutput1.dh = PastValue (LSTMoutput1.output) : [512 x 1 x *] -> [512 x 1 x *]
Validating --> LSTMoutput1.whh = Times (LSTMoutput1.Wh, LSTMoutput1.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
Validating --> LSTMoutput1.dc = PastValue (LSTMoutput1.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.Wcfdc = DiagTimes (LSTMoutput1.Wcf, LSTMoutput1.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.Wcidc = DiagTimes (LSTMoutput1.Wci, LSTMoutput1.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.dh = PastValue (LSTMoutput2.output) : [512 x 1 x *] -> [512 x 1 x *]
Validating --> LSTMoutput2.whh = Times (LSTMoutput2.Wh, LSTMoutput2.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
Validating --> LSTMoutput2.dc = PastValue (LSTMoutput2.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.Wcfdc = DiagTimes (LSTMoutput2.Wcf, LSTMoutput2.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.Wcidc = DiagTimes (LSTMoutput2.Wci, LSTMoutput2.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.dh = PastValue (LSTMoutput3.output) : [512 x 1 x *] -> [512 x 1 x *]
Validating --> LSTMoutput3.whh = Times (LSTMoutput3.Wh, LSTMoutput3.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
Validating --> LSTMoutput3.dc = PastValue (LSTMoutput3.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.Wcfdc = DiagTimes (LSTMoutput3.Wcf, LSTMoutput3.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.Wcidc = DiagTimes (LSTMoutput3.Wci, LSTMoutput3.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating network. 15 nodes to process in pass 3.
Validating network, final pass.
29 out of 113 nodes do not share the minibatch layout with the input data.
Post-processing network complete.
08/16/2016 10:01:48: Created model with 113 nodes on CPU.
08/16/2016 10:01:48: Training criterion node(s):
08/16/2016 10:01:48: ce = CrossEntropyWithSoftmax
08/16/2016 10:01:48: Evaluation criterion node(s):
08/16/2016 10:01:48: err = ErrorPrediction
Allocating matrices for forward and/or backward propagation.
Memory Sharing: Out of 217 matrices, 125 are shared as 56 buffers, and 92 are not shared.
{ LSTMoutput1.dh : [512 x 1 x *]
LSTMoutput1.wxx : [4096 x *] (gradient) }
{ LSTMoutput2.Wco : [1024] (gradient)
LSTMoutput3.dc : [1024 x 1 x *] }
{ LSTMoutput1.Wmr : [512 x 1024] (gradient)
LSTMoutput2.wxx : [4096 x *] }
{ LSTMoutput2.wx : [4096 x 512 x 1] (gradient)
LSTMoutput2.wxxpb : [4096 x 1 x *] }
{ LSTMoutput1.ot : [1024 x 1 x *] (gradient)
LSTMoutput2.whh : [4096 x 1 x *] }
{ LSTMoutput1.ct : [1024 x 1 x *] (gradient)
LSTMoutput2.wxxpbpwhh : [4096 x 1 x *] }
{ LSTMoutput1.G4 : [1024 x 1 x *] (gradient)
LSTMoutput2.G4 : [1024 x 1 x *] }
{ LSTMoutput1.unnamed164 : [1024 x 1 x *] (gradient)
LSTMoutput2.Wcfdc : [1024 x 1 x *] }
{ LSTMoutput1.wxxpbpwhh : [4096 x 1 x *] (gradient)
LSTMoutput2.unnamed175 : [1024 x 1 x *] }
{ LSTMoutput1.G1 : [1024 x 1 x *] (gradient)
LSTMoutput2.ft : [1024 x 1 x *] }
{ LSTMoutput1.Wci : [1024] (gradient)
LSTMoutput2.G1 : [1024 x 1 x *] }
{ LSTMoutput1.G3 : [1024 x 1 x *] (gradient)
LSTMoutput2.Wcidc : [1024 x 1 x *] }
{ LSTMoutput1.Wcf : [1024] (gradient)
LSTMoutput2.it : [1024 x 1 x *] }
{ LSTMoutput1.whh : [4096 x 1 x *] (gradient)
LSTMoutput2.G2 : [1024 x 1 x *] }
{ LSTMoutput1.b : [4096 x 1] (gradient)
LSTMoutput1.dh : [512 x 1 x *] (gradient)
LSTMoutput2.unnamed174 : [1024 x 1 x *] }
{ LSTMoutput2.Wmr : [512 x 1024] (gradient)
LSTMoutput3.wxx : [4096 x *] }
{ LSTMoutput3.wx : [4096 x 512 x 1] (gradient)
LSTMoutput3.wxxpb : [4096 x 1 x *] }
{ LSTMoutput2.ot : [1024 x 1 x *] (gradient)
LSTMoutput3.whh : [4096 x 1 x *] }
{ LSTMoutput2.ct : [1024 x 1 x *] (gradient)
LSTMoutput3.wxxpbpwhh : [4096 x 1 x *] }
{ LSTMoutput1.Wcoct : [1024 x 1 x *] (gradient)
LSTMoutput2.G4 : [1024 x 1 x *] (gradient)
LSTMoutput3.G4 : [1024 x 1 x *] }
{ LSTMoutput2.unnamed174 : [1024 x 1 x *] (gradient)
LSTMoutput3.Wcfdc : [1024 x 1 x *] }
{ LSTMoutput1.unnamed166 : [1024 x 1 x *] (gradient)
LSTMoutput2.wxxpbpwhh : [4096 x 1 x *] (gradient)
LSTMoutput3.unnamed185 : [1024 x 1 x *] }
{ LSTMoutput1.dc : [1024 x 1 x *] (gradient)
LSTMoutput2.G1 : [1024 x 1 x *] (gradient)
LSTMoutput3.ft : [1024 x 1 x *] }
{ LSTMoutput1.unnamed165 : [1024 x 1 x *] (gradient)
LSTMoutput3.bft : [1024 x 1 x *] }
{ LSTMoutput2.Wci : [1024] (gradient)
LSTMoutput3.G1 : [1024 x 1 x *] }
{ LSTMoutput2.G3 : [1024 x 1 x *] (gradient)
LSTMoutput3.Wcidc : [1024 x 1 x *] }
{ LSTMoutput1.it : [1024 x 1 x *] (gradient)
LSTMoutput3.unnamed183 : [1024 x 1 x *] }
{ LSTMoutput2.Wcf : [1024] (gradient)
LSTMoutput3.it : [1024 x 1 x *] }
{ LSTMoutput1.unnamed167 : [1024 x 1 x *] (gradient)
LSTMoutput2.whh : [4096 x 1 x *] (gradient)
LSTMoutput3.G2 : [1024 x 1 x *] }
{ LSTMoutput2.b : [4096 x 1] (gradient)
LSTMoutput2.dh : [512 x 1 x *] (gradient)
LSTMoutput3.unnamed184 : [1024 x 1 x *] }
{ LSTMoutput3.Wmr : [512 x 1024] (gradient)
unnamed193 : [132 x *] }
{ LSTMoutputW : [132 x 1 x *]
W : [132 x 512 x 1] (gradient) }
{ LSTMoutput1.mt : [1024 x 1 x *] (gradient)
LSTMoutput2.dh : [512 x 1 x *]
LSTMoutput2.wxx : [4096 x *] (gradient) }
{ LSTMoutput1.wx : [4096 x 363] (gradient)
LSTMoutput1.wxxpb : [4096 x 1 x *] }
{ LSTMoutput2.mt : [1024 x 1 x *] (gradient)
LSTMoutput3.dh : [512 x 1 x *]
LSTMoutput3.wxx : [4096 x *] (gradient) }
{ LSTMoutput3.output : [512 x 1 x *] (gradient)
LSTMoutputW : [132 x 1 x *] (gradient) }
{ LSTMoutput3.mt : [1024 x 1 x *] (gradient)
unnamed193 : [132 x *] (gradient) }
{ LSTMoutput2.Wcoct : [1024 x 1 x *] (gradient)
LSTMoutput3.G4 : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.ft : [1024 x 1 x *] (gradient)
LSTMoutput3.bft : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.output : [512 x 1 x *] (gradient)
LSTMoutput2.wxxpb : [4096 x 1 x *] (gradient)
LSTMoutput3.it : [1024 x 1 x *] (gradient) }
{ LSTMoutput2.Wh : [4096 x 512] (gradient)
LSTMoutput3.G2 : [1024 x 1 x *] (gradient) }
{ LSTMoutput2.unnamed176 : [1024 x 1 x *] (gradient)
LSTMoutput3.wxxpbpwhh : [4096 x 1 x *] (gradient) }
{ LSTMoutput1.bit : [1024 x 1 x *] (gradient)
LSTMoutput3.unnamed183 : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.bft : [1024 x 1 x *] (gradient)
LSTMoutput2.dc : [1024 x 1 x *] (gradient)
LSTMoutput3.G1 : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.G2 : [1024 x 1 x *] (gradient)
LSTMoutput2.Wcfdc : [1024 x 1 x *] (gradient)
LSTMoutput3.Wcidc : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.unnamed163 : [1024 x 1 x *] (gradient)
LSTMoutput2.unnamed175 : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.Wcidc : [1024 x 1 x *] (gradient)
LSTMoutput2.ft : [1024 x 1 x *] (gradient) }
{ LSTMoutput2.bft : [1024 x 1 x *] (gradient)
LSTMoutput3.dc : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.Wcfdc : [1024 x 1 x *] (gradient)
LSTMoutput2.Wcidc : [1024 x 1 x *] (gradient)
LSTMoutput3.ft : [1024 x 1 x *] (gradient) }
{ LSTMoutput2.unnamed173 : [1024 x 1 x *] (gradient)
LSTMoutput3.unnamed185 : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.Wh : [4096 x 512] (gradient)
LSTMoutput2.G2 : [1024 x 1 x *] (gradient)
LSTMoutput3.Wcfdc : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.wxxpb : [4096 x 1 x *] (gradient)
LSTMoutput2.it : [1024 x 1 x *] (gradient) }
{ LSTMoutput2.output : [512 x 1 x *] (gradient)
LSTMoutput3.wxxpb : [4096 x 1 x *] (gradient) }
{ LSTMoutput2.unnamed177 : [1024 x 1 x *] (gradient)
LSTMoutput3.whh : [4096 x 1 x *] (gradient) }
{ LSTMoutput3.b : [4096 x 1] (gradient)
LSTMoutput3.dh : [512 x 1 x *] (gradient) }
{ LSTMoutput1.Wco : [1024] (gradient)
LSTMoutput2.dc : [1024 x 1 x *] }
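The sharing table above pairs matrices whose lifetimes never overlap in the forward/backward schedule, which is how 125 of the 217 matrices collapse onto 56 physical buffers. A hypothetical sketch of the interval-reuse idea (share_buffers is illustrative, not CNTK's actual allocator):

def share_buffers(lifetimes):
    """lifetimes: {name: (first_use, last_use)} in execution-step order."""
    buffers = []                           # each entry: [last_use, [tenants]]
    for name, (start, end) in sorted(lifetimes.items(), key=lambda kv: kv[1][0]):
        for buf in buffers:
            if buf[0] < start:             # previous tenant is dead
                buf[0] = end               # reuse this buffer
                buf[1].append(name)
                break
        else:
            buffers.append([end, [name]])  # no free buffer: allocate a new one
    return [names for _, names in buffers]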
08/16/2016 10:01:48: Training 13634692 parameters in 23 out of 23 parameter tensors and 104 nodes with gradient:
08/16/2016 10:01:48: Node 'LSTMoutput1.Wcf' (LearnableParameter operation) : [1024]
08/16/2016 10:01:48: Node 'LSTMoutput1.Wci' (LearnableParameter operation) : [1024]
08/16/2016 10:01:48: Node 'LSTMoutput1.Wco' (LearnableParameter operation) : [1024]
08/16/2016 10:01:48: Node 'LSTMoutput1.Wh' (LearnableParameter operation) : [4096 x 512]
08/16/2016 10:01:48: Node 'LSTMoutput1.Wmr' (LearnableParameter operation) : [512 x 1024]
08/16/2016 10:01:48: Node 'LSTMoutput1.b' (LearnableParameter operation) : [4096 x 1]
08/16/2016 10:01:48: Node 'LSTMoutput1.wx' (LearnableParameter operation) : [4096 x 363]
08/16/2016 10:01:48: Node 'LSTMoutput2.Wcf' (LearnableParameter operation) : [1024]
08/16/2016 10:01:48: Node 'LSTMoutput2.Wci' (LearnableParameter operation) : [1024]
08/16/2016 10:01:48: Node 'LSTMoutput2.Wco' (LearnableParameter operation) : [1024]
08/16/2016 10:01:48: Node 'LSTMoutput2.Wh' (LearnableParameter operation) : [4096 x 512]
08/16/2016 10:01:48: Node 'LSTMoutput2.Wmr' (LearnableParameter operation) : [512 x 1024]
08/16/2016 10:01:48: Node 'LSTMoutput2.b' (LearnableParameter operation) : [4096 x 1]
08/16/2016 10:01:48: Node 'LSTMoutput2.wx' (LearnableParameter operation) : [4096 x 512 x 1]
08/16/2016 10:01:48: Node 'LSTMoutput3.Wcf' (LearnableParameter operation) : [1024]
08/16/2016 10:01:48: Node 'LSTMoutput3.Wci' (LearnableParameter operation) : [1024]
08/16/2016 10:01:48: Node 'LSTMoutput3.Wco' (LearnableParameter operation) : [1024]
08/16/2016 10:01:48: Node 'LSTMoutput3.Wh' (LearnableParameter operation) : [4096 x 512]
08/16/2016 10:01:48: Node 'LSTMoutput3.Wmr' (LearnableParameter operation) : [512 x 1024]
08/16/2016 10:01:48: Node 'LSTMoutput3.b' (LearnableParameter operation) : [4096 x 1]
08/16/2016 10:01:48: Node 'LSTMoutput3.wx' (LearnableParameter operation) : [4096 x 512 x 1]
08/16/2016 10:01:48: Node 'W' (LearnableParameter operation) : [132 x 512 x 1]
08/16/2016 10:01:48: Node 'b' (LearnableParameter operation) : [132 x 1]
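The 13,634,692 total follows from the tensor shapes just listed: each layer contributes the same peephole/recurrence/projection/bias count, and only the input weight wx differs between layer 1 and layers 2-3. A quick check:

# Sanity check of the parameter count from the shapes above.
per_layer = 3 * 1024 + 4096 * 512 + 512 * 1024 + 4096   # Wci+Wcf+Wco, Wh, Wmr, b
total = 3 * per_layer + 4096 * 363 + 2 * (4096 * 512) + 132 * 512 + 132
assert total == 13634692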
08/16/2016 10:01:48: Precomputing --> 3 PreCompute nodes found.
08/16/2016 10:01:48: featNorm.xMean = Mean()
08/16/2016 10:01:48: featNorm.xStdDev = InvStdDev()
08/16/2016 10:01:48: logPrior.prior = Mean()
minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapass
requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
08/16/2016 10:01:49: Precomputing --> Completed.
08/16/2016 10:01:50: Starting Epoch 1: learning rate per sample = 0.001953 effective momentum = 0.000000 momentum as time constant = 0.0 samples
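The logged per-sample rate equals learningRatesPerMB = 0.5 divided by 256; with minibatchSize = 16 and nbrUttsIneachRecurrentIter = 16 parallel utterances, 256 = 16 * 16 is the natural reading, inferred from the logged numbers rather than from a documented conversion rule:

# 0.5 per 256-sample minibatch -> per-sample rate (exact in binary floating point)
assert 0.5 / (16 * 16) == 0.001953125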
minibatchiterator: epoch 0: frames [0..64] (first utterance at frame 0), data subset 0 of 1, with 1 datapass
08/16/2016 10:01:50: Starting minibatch loop.
08/16/2016 10:01:53: Epoch[ 1 of 1]-Minibatch[ 1- 10, 250.00%]: ce = 4.87313957 * 160; err = 0.90625000 * 160; time = 3.3910s; samplesPerSecond = 47.2
08/16/2016 10:01:56: Epoch[ 1 of 1]-Minibatch[ 11- 20, 500.00%]: ce = 4.84521751 * 160; err = 0.69375000 * 160; time = 2.9626s; samplesPerSecond = 54.0
08/16/2016 10:01:58: Finished Epoch[ 1 of 1]: [Training] ce = 4.85644356 * 418; err = 0.80382775 * 418; totalSamplesSeen = 418; learningRatePerSample = 0.001953125; epochTime=8.39953s
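In these progress lines, "ce = a * N" reports the average criterion a over N frames, so the epoch line aggregates 418 frames: two logged blocks of 160 plus 98 frames from the remaining partial minibatches (a reading of the log format; the decomposition is plain arithmetic):

assert 160 + 160 + 98 == 418   # frames in minibatches 1-10, 11-20, and the rest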
08/16/2016 10:01:59: SGD: Saving checkpoint model '/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_cpu/Models/cntkSpeechLSTM.dnn'
08/16/2016 10:01:59: CNTKCommandTrainEnd: speechTrain
08/16/2016 10:01:59: Action "train" complete.
08/16/2016 10:01:59: __COMPLETED__

View file

@ -1 +0,0 @@
__COMPLETED__

View file

@ -1 +0,0 @@
__COMPLETED__

View file

@ -0,0 +1,683 @@
CPU info:
CPU Model Name: Intel(R) Xeon(R) CPU E5-2630 v2 @ 2.60GHz
Hardware threads: 24
Total Memory: 264172964 kB
-------------------------------------------------------------------
=== Running /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/1bitsgd/release/bin/cntk configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config/LSTM-NDL.cntk currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu DeviceId=0 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=64]] speechTrain=[reader=[useMersenneTwisterRand=true]] parallelTrain=false
-------------------------------------------------------------------
Build info:
Built time: Aug 16 2016 09:41:57
Last modified date: Mon Aug 15 23:39:17 2016
Build type: release
Build target: GPU
With 1bit-SGD: yes
Math lib: mkl
CUDA_PATH: /usr/local/cuda-7.5
CUB_PATH: /usr/local/cub-1.4.1
CUDNN_PATH: /usr/local/cudnn-4.0
Build Branch: HEAD
Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
Built by philly on 643085f7f8c2
Build Path: /home/philly/jenkins/workspace/CNTK-Build-Linux
-------------------------------------------------------------------
Changed current directory to /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
08/16/2016 10:02:00: -------------------------------------------------------------------
08/16/2016 10:02:00: Build info:
08/16/2016 10:02:00: Built time: Aug 16 2016 09:41:57
08/16/2016 10:02:00: Last modified date: Mon Aug 15 23:39:17 2016
08/16/2016 10:02:00: Build type: release
08/16/2016 10:02:00: Build target: GPU
08/16/2016 10:02:00: With 1bit-SGD: yes
08/16/2016 10:02:00: Math lib: mkl
08/16/2016 10:02:00: CUDA_PATH: /usr/local/cuda-7.5
08/16/2016 10:02:00: CUB_PATH: /usr/local/cub-1.4.1
08/16/2016 10:02:00: CUDNN_PATH: /usr/local/cudnn-4.0
08/16/2016 10:02:00: Build Branch: HEAD
08/16/2016 10:02:00: Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
08/16/2016 10:02:00: Built by philly on 643085f7f8c2
08/16/2016 10:02:00: Build Path: /home/philly/jenkins/workspace/CNTK-Build-Linux
08/16/2016 10:02:00: -------------------------------------------------------------------
08/16/2016 10:02:01: -------------------------------------------------------------------
08/16/2016 10:02:01: GPU info:
08/16/2016 10:02:01: Device[0]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
08/16/2016 10:02:01: Device[1]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
08/16/2016 10:02:01: Device[2]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
08/16/2016 10:02:01: Device[3]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3071 MB
08/16/2016 10:02:01: -------------------------------------------------------------------
08/16/2016 10:02:01: Running on localhost at 2016/08/16 10:02:01
08/16/2016 10:02:01: Command line:
/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/build/1bitsgd/release/bin/cntk configFile=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config/LSTM-NDL.cntk currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu DeviceId=0 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=64]] speechTrain=[reader=[useMersenneTwisterRand=true]] parallelTrain=false
08/16/2016 10:02:01: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
08/16/2016 10:02:01: RootDir = ".."
ConfigDir = "$RootDir$/Config"
DataDir = "$RootDir$/Data"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"
deviceId = -1
command = speechTrain
precision = "float"
traceLevel = 1
modelPath = "$ModelDir$/cntkSpeechLSTM.dnn"
parallelTrain = true
frameMode = false
truncated = true
speechTrain = [
action = "train"
nbrUttsIneachRecurrentIter = 16
NDLNetworkBuilder = [
networkDescription = "$ConfigDir$/lstmp-3layer-opt.ndl"
]
SGD = [
epochSize = 0
minibatchSize = 16
learningRatesPerMB = 0.5
numMBsToShowResult = 10
momentumPerMB = 0:0.9
maxEpochs = 4
keepCheckPointFiles = true
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "$DataDir$/glob_0000.scp"
]
labels = [
mlfFile = "$DataDir$/glob_0000.mlf"
labelMappingFile = "$DataDir$/state.list"
labelDim = 132
labelType = "category"
]
]
]
currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu
DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config
OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu
DeviceId=0
timestamping=true
speechTrain=[SGD=[maxEpochs=1]]
speechTrain=[SGD=[epochSize=64]]
speechTrain=[reader=[useMersenneTwisterRand=true]]
parallelTrain=false
08/16/2016 10:02:01: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<
08/16/2016 10:02:01: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
08/16/2016 10:02:01: RootDir = ".."
ConfigDir = "../Config"
DataDir = "../Data"
OutputDir = "../Output"
ModelDir = "/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu/Models"
deviceId = -1
command = speechTrain
precision = "float"
traceLevel = 1
modelPath = "/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu/Models/cntkSpeechLSTM.dnn"
parallelTrain = true
frameMode = false
truncated = true
speechTrain = [
action = "train"
nbrUttsIneachRecurrentIter = 16
NDLNetworkBuilder = [
networkDescription = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config/lstmp-3layer-opt.ndl"
]
SGD = [
epochSize = 0
minibatchSize = 16
learningRatesPerMB = 0.5
numMBsToShowResult = 10
momentumPerMB = 0:0.9
maxEpochs = 4
keepCheckPointFiles = true
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp"
]
labels = [
mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf"
labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list"
labelDim = 132
labelType = "category"
]
]
]
currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu
DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config
OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu
DeviceId=0
timestamping=true
speechTrain=[SGD=[maxEpochs=1]]
speechTrain=[SGD=[epochSize=64]]
speechTrain=[reader=[useMersenneTwisterRand=true]]
parallelTrain=false
08/16/2016 10:02:01: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
08/16/2016 10:02:01: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
configparameters: LSTM-NDL.cntk:command=speechTrain
configparameters: LSTM-NDL.cntk:ConfigDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config
configparameters: LSTM-NDL.cntk:currentDirectory=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
configparameters: LSTM-NDL.cntk:DataDir=/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data
configparameters: LSTM-NDL.cntk:deviceId=0
configparameters: LSTM-NDL.cntk:frameMode=false
configparameters: LSTM-NDL.cntk:ModelDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu/Models
configparameters: LSTM-NDL.cntk:modelPath=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu/Models/cntkSpeechLSTM.dnn
configparameters: LSTM-NDL.cntk:OutputDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu
configparameters: LSTM-NDL.cntk:parallelTrain=false
configparameters: LSTM-NDL.cntk:precision=float
configparameters: LSTM-NDL.cntk:RootDir=..
configparameters: LSTM-NDL.cntk:RunDir=/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu
configparameters: LSTM-NDL.cntk:speechTrain=[
action = "train"
nbrUttsIneachRecurrentIter = 16
NDLNetworkBuilder = [
networkDescription = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Tests/EndToEndTests/Examples/Speech/AN4/LSTM/../../../../../../Examples/Speech/AN4/Config/lstmp-3layer-opt.ndl"
]
SGD = [
epochSize = 0
minibatchSize = 16
learningRatesPerMB = 0.5
numMBsToShowResult = 10
momentumPerMB = 0:0.9
maxEpochs = 4
keepCheckPointFiles = true
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp"
]
labels = [
mlfFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf"
labelMappingFile = "/home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list"
labelDim = 132
labelType = "category"
]
]
] [SGD=[maxEpochs=1]] [SGD=[epochSize=64]] [reader=[useMersenneTwisterRand=true]]
configparameters: LSTM-NDL.cntk:timestamping=true
configparameters: LSTM-NDL.cntk:traceLevel=1
configparameters: LSTM-NDL.cntk:truncated=true
08/16/2016 10:02:01: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
08/16/2016 10:02:01: Commands: speechTrain
08/16/2016 10:02:01: Precision = "float"
08/16/2016 10:02:01: CNTKModelPath: /tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu/Models/cntkSpeechLSTM.dnn
08/16/2016 10:02:01: CNTKCommandTrainInfo: speechTrain : 1
08/16/2016 10:02:01: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 1
08/16/2016 10:02:01: ##############################################################################
08/16/2016 10:02:01: # #
08/16/2016 10:02:01: # Action "train" #
08/16/2016 10:02:01: # #
08/16/2016 10:02:01: ##############################################################################
08/16/2016 10:02:01: CNTKCommandTrainBegin: speechTrain
NDLBuilder Using GPU 0
reading script file /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.scp ... 948 entries
total 132 state names in state list /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/state.list
htkmlfreader: reading MLF file /home/philly/jenkins/workspace/CNTK-Test-Linux-W1/Examples/Speech/AN4/Data/glob_0000.mlf ... total 948 entries
...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
label set 0: 129 classes
minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
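The chunk averages are just the utterance and frame totals divided over the three chunks:

assert 948 / 3 == 316.0                    # av. chunk size in utterances
assert abs(252734 / 3 - 84244.7) < 0.05    # av. chunk size in frames (84244.67)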
useParallelTrain option is not enabled. ParallelTrain config will be ignored.
08/16/2016 10:02:01: Creating virgin network.
Node 'LSTMoutput1.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput1.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput1.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput1.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput1.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
Node 'LSTMoutput2.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput2.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput2.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput2.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput2.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
Node 'LSTMoutput3.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput3.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput3.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput3.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput3.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
Node 'b' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
Node 'LSTMoutput1.wx' (LearnableParameter operation): Initializing Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput1.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput1.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput1.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=3, range=0.050000*1.000000, onCPU=false).
SetUniformRandomValue (GPU): creating curand object with seed 3, sizeof(ElemType)==4
Node 'LSTMoutput1.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=4, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput1.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=5, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput1.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=6, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput2.wx' (LearnableParameter operation): Initializing Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput2.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput2.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput2.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=9, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput2.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=10, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput2.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=11, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput2.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=12, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput3.wx' (LearnableParameter operation): Initializing Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput3.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput3.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput3.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=15, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput3.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=16, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput3.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=17, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput3.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=18, range=0.050000*1.000000, onCPU=false).
Node 'W' (LearnableParameter operation): Initializing Parameter[132 x 0] as uniform later when dimensions are fully known.
Node 'b' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
Post-processing network...
6 roots:
ce = CrossEntropyWithSoftmax()
err = ErrorPrediction()
featNorm.xMean = Mean()
featNorm.xStdDev = InvStdDev()
logPrior.prior = Mean()
scaledLogLikelihood = Minus()
Loop[0] --> Loop_LSTMoutput1.output -> 24 nodes
LSTMoutput1.dh LSTMoutput1.whh LSTMoutput1.wxxpbpwhh
LSTMoutput1.G4 LSTMoutput1.G3 LSTMoutput1.dc
LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft
LSTMoutput1.bft LSTMoutput1.G1 LSTMoutput1.Wcidc
LSTMoutput1.unnamed163 LSTMoutput1.it LSTMoutput1.G2
LSTMoutput1.unnamed164 LSTMoutput1.bit LSTMoutput1.ct
LSTMoutput1.Wcoct LSTMoutput1.unnamed166 LSTMoutput1.ot
LSTMoutput1.unnamed167 LSTMoutput1.mt LSTMoutput1.output
Loop[1] --> Loop_LSTMoutput2.output -> 24 nodes
LSTMoutput2.dh LSTMoutput2.whh LSTMoutput2.wxxpbpwhh
LSTMoutput2.G4 LSTMoutput2.G3 LSTMoutput2.dc
LSTMoutput2.Wcfdc LSTMoutput2.unnamed175 LSTMoutput2.ft
LSTMoutput2.bft LSTMoutput2.G1 LSTMoutput2.Wcidc
LSTMoutput2.unnamed173 LSTMoutput2.it LSTMoutput2.G2
LSTMoutput2.unnamed174 LSTMoutput2.bit LSTMoutput2.ct
LSTMoutput2.Wcoct LSTMoutput2.unnamed176 LSTMoutput2.ot
LSTMoutput2.unnamed177 LSTMoutput2.mt LSTMoutput2.output
Loop[2] --> Loop_LSTMoutput3.output -> 24 nodes
LSTMoutput3.dh LSTMoutput3.whh LSTMoutput3.wxxpbpwhh
LSTMoutput3.G4 LSTMoutput3.G3 LSTMoutput3.dc
LSTMoutput3.Wcfdc LSTMoutput3.unnamed185 LSTMoutput3.ft
LSTMoutput3.bft LSTMoutput3.G1 LSTMoutput3.Wcidc
LSTMoutput3.unnamed183 LSTMoutput3.it LSTMoutput3.G2
LSTMoutput3.unnamed184 LSTMoutput3.bit LSTMoutput3.ct
LSTMoutput3.Wcoct LSTMoutput3.unnamed186 LSTMoutput3.ot
LSTMoutput3.unnamed187 LSTMoutput3.mt LSTMoutput3.output
Validating network. 113 nodes to process in pass 1.
Validating --> labels = InputValue() : -> [132 x *]
Validating --> W = LearnableParameter() : -> [132 x 0]
Validating --> LSTMoutput3.Wmr = LearnableParameter() : -> [512 x 1024]
Validating --> LSTMoutput3.wx = LearnableParameter() : -> [4096 x 0]
Validating --> LSTMoutput2.Wmr = LearnableParameter() : -> [512 x 1024]
Validating --> LSTMoutput2.wx = LearnableParameter() : -> [4096 x 0]
Validating --> LSTMoutput1.Wmr = LearnableParameter() : -> [512 x 1024]
Validating --> LSTMoutput1.wx = LearnableParameter() : -> [4096 x 0]
Validating --> features = InputValue() : -> [363 x *]
Validating --> featNorm.xMean = Mean (features) : [363 x *] -> [363]
Validating --> featNorm.xStdDev = InvStdDev (features) : [363 x *] -> [363]
Validating --> featNorm.xNorm = PerDimMeanVarNormalization (features, featNorm.xMean, featNorm.xStdDev) : [363 x *], [363], [363] -> [363 x *]
Node 'LSTMoutput1.wx' (LearnableParameter operation): Tensor shape was inferred as [4096 x 363].
Node 'LSTMoutput1.wx' (LearnableParameter operation): Initializing Parameter[4096 x 363] <- uniform(seed=1, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput1.wxx = Times (LSTMoutput1.wx, featNorm.xNorm) : [4096 x 363], [363 x *] -> [4096 x *]
Validating --> LSTMoutput1.b = LearnableParameter() : -> [4096 x 1]
Validating --> LSTMoutput1.wxxpb = Plus (LSTMoutput1.wxx, LSTMoutput1.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
Validating --> LSTMoutput1.Wh = LearnableParameter() : -> [4096 x 0]
Validating --> LSTMoutput1.Wco = LearnableParameter() : -> [1024]
Validating --> LSTMoutput1.Wcf = LearnableParameter() : -> [1024]
Validating --> LSTMoutput1.Wci = LearnableParameter() : -> [1024]
Node 'LSTMoutput1.Wh' (LearnableParameter operation): Tensor shape was inferred as [4096 x 512].
Node 'LSTMoutput1.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=2, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput1.whh = Times (LSTMoutput1.Wh, LSTMoutput1.dh) : [4096 x 512], [512] -> [4096]
Validating --> LSTMoutput1.wxxpbpwhh = Plus (LSTMoutput1.wxxpb, LSTMoutput1.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
Validating --> LSTMoutput1.G4 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.G3 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.Wcfdc = DiagTimes (LSTMoutput1.Wcf, LSTMoutput1.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput1.unnamed165 = Plus (LSTMoutput1.G3, LSTMoutput1.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput1.ft = Sigmoid (LSTMoutput1.unnamed165) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.bft = ElementTimes (LSTMoutput1.ft, LSTMoutput1.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput1.G1 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.Wcidc = DiagTimes (LSTMoutput1.Wci, LSTMoutput1.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput1.unnamed163 = Plus (LSTMoutput1.G1, LSTMoutput1.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput1.it = Sigmoid (LSTMoutput1.unnamed163) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.G2 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.unnamed164 = Tanh (LSTMoutput1.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.bit = ElementTimes (LSTMoutput1.it, LSTMoutput1.unnamed164) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.ct = Plus (LSTMoutput1.bft, LSTMoutput1.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.Wcoct = DiagTimes (LSTMoutput1.Wco, LSTMoutput1.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.unnamed166 = Plus (LSTMoutput1.G4, LSTMoutput1.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.ot = Sigmoid (LSTMoutput1.unnamed166) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.unnamed167 = Tanh (LSTMoutput1.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.mt = ElementTimes (LSTMoutput1.ot, LSTMoutput1.unnamed167) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.output = Times (LSTMoutput1.Wmr, LSTMoutput1.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
Node 'LSTMoutput2.wx' (LearnableParameter operation): Tensor shape was inferred as [4096 x 512 x 1].
Node 'LSTMoutput2.wx' (LearnableParameter operation): Initializing Parameter[4096 x 512 x 1] <- uniform(seed=7, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput2.wxx = Times (LSTMoutput2.wx, LSTMoutput1.output) : [4096 x 512 x 1], [512 x 1 x *] -> [4096 x *]
Validating --> LSTMoutput2.b = LearnableParameter() : -> [4096 x 1]
Validating --> LSTMoutput2.wxxpb = Plus (LSTMoutput2.wxx, LSTMoutput2.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
Validating --> LSTMoutput2.Wh = LearnableParameter() : -> [4096 x 0]
Validating --> LSTMoutput2.Wco = LearnableParameter() : -> [1024]
Validating --> LSTMoutput2.Wcf = LearnableParameter() : -> [1024]
Validating --> LSTMoutput2.Wci = LearnableParameter() : -> [1024]
Node 'LSTMoutput2.Wh' (LearnableParameter operation): Tensor shape was inferred as [4096 x 512].
Node 'LSTMoutput2.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=8, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput2.whh = Times (LSTMoutput2.Wh, LSTMoutput2.dh) : [4096 x 512], [512] -> [4096]
Validating --> LSTMoutput2.wxxpbpwhh = Plus (LSTMoutput2.wxxpb, LSTMoutput2.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
Validating --> LSTMoutput2.G4 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.G3 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.Wcfdc = DiagTimes (LSTMoutput2.Wcf, LSTMoutput2.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput2.unnamed175 = Plus (LSTMoutput2.G3, LSTMoutput2.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput2.ft = Sigmoid (LSTMoutput2.unnamed175) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.bft = ElementTimes (LSTMoutput2.ft, LSTMoutput2.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput2.G1 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.Wcidc = DiagTimes (LSTMoutput2.Wci, LSTMoutput2.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput2.unnamed173 = Plus (LSTMoutput2.G1, LSTMoutput2.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput2.it = Sigmoid (LSTMoutput2.unnamed173) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.G2 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.unnamed174 = Tanh (LSTMoutput2.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.bit = ElementTimes (LSTMoutput2.it, LSTMoutput2.unnamed174) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.ct = Plus (LSTMoutput2.bft, LSTMoutput2.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.Wcoct = DiagTimes (LSTMoutput2.Wco, LSTMoutput2.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.unnamed176 = Plus (LSTMoutput2.G4, LSTMoutput2.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.ot = Sigmoid (LSTMoutput2.unnamed176) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.unnamed177 = Tanh (LSTMoutput2.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.mt = ElementTimes (LSTMoutput2.ot, LSTMoutput2.unnamed177) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.output = Times (LSTMoutput2.Wmr, LSTMoutput2.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
Node 'LSTMoutput3.wx' (LearnableParameter operation): Tensor shape was inferred as [4096 x 512 x 1].
Node 'LSTMoutput3.wx' (LearnableParameter operation): Initializing Parameter[4096 x 512 x 1] <- uniform(seed=13, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput3.wxx = Times (LSTMoutput3.wx, LSTMoutput2.output) : [4096 x 512 x 1], [512 x 1 x *] -> [4096 x *]
Validating --> LSTMoutput3.b = LearnableParameter() : -> [4096 x 1]
Validating --> LSTMoutput3.wxxpb = Plus (LSTMoutput3.wxx, LSTMoutput3.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
Validating --> LSTMoutput3.Wh = LearnableParameter() : -> [4096 x 0]
Validating --> LSTMoutput3.Wco = LearnableParameter() : -> [1024]
Validating --> LSTMoutput3.Wcf = LearnableParameter() : -> [1024]
Validating --> LSTMoutput3.Wci = LearnableParameter() : -> [1024]
Node 'LSTMoutput3.Wh' (LearnableParameter operation): Tensor shape was inferred as [4096 x 512].
Node 'LSTMoutput3.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=14, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput3.whh = Times (LSTMoutput3.Wh, LSTMoutput3.dh) : [4096 x 512], [512] -> [4096]
Validating --> LSTMoutput3.wxxpbpwhh = Plus (LSTMoutput3.wxxpb, LSTMoutput3.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
Validating --> LSTMoutput3.G4 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.G3 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.Wcfdc = DiagTimes (LSTMoutput3.Wcf, LSTMoutput3.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput3.unnamed185 = Plus (LSTMoutput3.G3, LSTMoutput3.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput3.ft = Sigmoid (LSTMoutput3.unnamed185) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.bft = ElementTimes (LSTMoutput3.ft, LSTMoutput3.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput3.G1 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.Wcidc = DiagTimes (LSTMoutput3.Wci, LSTMoutput3.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput3.unnamed183 = Plus (LSTMoutput3.G1, LSTMoutput3.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput3.it = Sigmoid (LSTMoutput3.unnamed183) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.G2 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.unnamed184 = Tanh (LSTMoutput3.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.bit = ElementTimes (LSTMoutput3.it, LSTMoutput3.unnamed184) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.ct = Plus (LSTMoutput3.bft, LSTMoutput3.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.Wcoct = DiagTimes (LSTMoutput3.Wco, LSTMoutput3.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.unnamed186 = Plus (LSTMoutput3.G4, LSTMoutput3.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.ot = Sigmoid (LSTMoutput3.unnamed186) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.unnamed187 = Tanh (LSTMoutput3.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.mt = ElementTimes (LSTMoutput3.ot, LSTMoutput3.unnamed187) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.output = Times (LSTMoutput3.Wmr, LSTMoutput3.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
Node 'W' (LearnableParameter operation): Tensor shape was inferred as [132 x 512 x 1].
Node 'W' (LearnableParameter operation): Initializing Parameter[132 x 512 x 1] <- uniform(seed=19, range=0.050000*1.000000, onCPU=false).
Validating --> unnamed193 = Times (W, LSTMoutput3.output) : [132 x 512 x 1], [512 x 1 x *] -> [132 x *]
Validating --> b = LearnableParameter() : -> [132 x 1]
Validating --> LSTMoutputW = Plus (unnamed193, b) : [132 x *], [132 x 1] -> [132 x 1 x *]
Validating --> ce = CrossEntropyWithSoftmax (labels, LSTMoutputW) : [132 x *], [132 x 1 x *] -> [1]
Validating --> err = ErrorPrediction (labels, LSTMoutputW) : [132 x *], [132 x 1 x *] -> [1]
Validating --> logPrior.prior = Mean (labels) : [132 x *] -> [132]
Validating --> logPrior.logPrior = Log (logPrior.prior) : [132] -> [132]
Validating --> scaledLogLikelihood = Minus (LSTMoutputW, logPrior.logPrior) : [132 x 1 x *], [132] -> [132 x 1 x *]
Validating network. 88 nodes to process in pass 2.
Validating --> LSTMoutput1.dh = PastValue (LSTMoutput1.output) : [512 x 1 x *] -> [512 x 1 x *]
Validating --> LSTMoutput1.whh = Times (LSTMoutput1.Wh, LSTMoutput1.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
Validating --> LSTMoutput1.dc = PastValue (LSTMoutput1.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.Wcfdc = DiagTimes (LSTMoutput1.Wcf, LSTMoutput1.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.Wcidc = DiagTimes (LSTMoutput1.Wci, LSTMoutput1.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.dh = PastValue (LSTMoutput2.output) : [512 x 1 x *] -> [512 x 1 x *]
Validating --> LSTMoutput2.whh = Times (LSTMoutput2.Wh, LSTMoutput2.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
Validating --> LSTMoutput2.dc = PastValue (LSTMoutput2.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.Wcfdc = DiagTimes (LSTMoutput2.Wcf, LSTMoutput2.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.Wcidc = DiagTimes (LSTMoutput2.Wci, LSTMoutput2.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.dh = PastValue (LSTMoutput3.output) : [512 x 1 x *] -> [512 x 1 x *]
Validating --> LSTMoutput3.whh = Times (LSTMoutput3.Wh, LSTMoutput3.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
Validating --> LSTMoutput3.dc = PastValue (LSTMoutput3.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.Wcfdc = DiagTimes (LSTMoutput3.Wcf, LSTMoutput3.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.Wcidc = DiagTimes (LSTMoutput3.Wci, LSTMoutput3.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating network. 15 nodes to process in pass 3.
Validating network, final pass.
29 out of 113 nodes do not share the minibatch layout with the input data.
Post-processing network complete.
08/16/2016 10:02:01: Created model with 113 nodes on GPU 0.
08/16/2016 10:02:01: Training criterion node(s):
08/16/2016 10:02:01: ce = CrossEntropyWithSoftmax
08/16/2016 10:02:01: Evaluation criterion node(s):
08/16/2016 10:02:01: err = ErrorPrediction
Allocating matrices for forward and/or backward propagation.
Memory Sharing: Out of 217 matrices, 125 are shared as 56 buffers, and 92 are not shared.
{ LSTMoutput2.mt : [1024 x 1 x *] (gradient)
LSTMoutput3.dh : [512 x 1 x *]
LSTMoutput3.wxx : [4096 x *] (gradient) }
{ LSTMoutput2.Wco : [1024] (gradient)
LSTMoutput3.dc : [1024 x 1 x *] }
{ LSTMoutput1.wx : [4096 x 363] (gradient)
LSTMoutput1.wxxpb : [4096 x 1 x *] }
{ LSTMoutput1.Wmr : [512 x 1024] (gradient)
LSTMoutput2.wxx : [4096 x *] }
{ LSTMoutput2.wx : [4096 x 512 x 1] (gradient)
LSTMoutput2.wxxpb : [4096 x 1 x *] }
{ LSTMoutput1.ot : [1024 x 1 x *] (gradient)
LSTMoutput2.whh : [4096 x 1 x *] }
{ LSTMoutput1.ct : [1024 x 1 x *] (gradient)
LSTMoutput2.wxxpbpwhh : [4096 x 1 x *] }
{ LSTMoutput1.G4 : [1024 x 1 x *] (gradient)
LSTMoutput2.G4 : [1024 x 1 x *] }
{ LSTMoutput1.unnamed164 : [1024 x 1 x *] (gradient)
LSTMoutput2.Wcfdc : [1024 x 1 x *] }
{ LSTMoutput1.wxxpbpwhh : [4096 x 1 x *] (gradient)
LSTMoutput2.unnamed175 : [1024 x 1 x *] }
{ LSTMoutput1.G1 : [1024 x 1 x *] (gradient)
LSTMoutput2.ft : [1024 x 1 x *] }
{ LSTMoutput1.Wci : [1024] (gradient)
LSTMoutput2.G1 : [1024 x 1 x *] }
{ LSTMoutput1.G3 : [1024 x 1 x *] (gradient)
LSTMoutput2.Wcidc : [1024 x 1 x *] }
{ LSTMoutput1.Wcf : [1024] (gradient)
LSTMoutput2.it : [1024 x 1 x *] }
{ LSTMoutput1.whh : [4096 x 1 x *] (gradient)
LSTMoutput2.G2 : [1024 x 1 x *] }
{ LSTMoutput1.b : [4096 x 1] (gradient)
LSTMoutput1.dh : [512 x 1 x *] (gradient)
LSTMoutput2.unnamed174 : [1024 x 1 x *] }
{ LSTMoutput2.Wmr : [512 x 1024] (gradient)
LSTMoutput3.wxx : [4096 x *] }
{ LSTMoutput3.wx : [4096 x 512 x 1] (gradient)
LSTMoutput3.wxxpb : [4096 x 1 x *] }
{ LSTMoutput2.ot : [1024 x 1 x *] (gradient)
LSTMoutput3.whh : [4096 x 1 x *] }
{ LSTMoutput2.ct : [1024 x 1 x *] (gradient)
LSTMoutput3.wxxpbpwhh : [4096 x 1 x *] }
{ LSTMoutput1.Wcoct : [1024 x 1 x *] (gradient)
LSTMoutput2.G4 : [1024 x 1 x *] (gradient)
LSTMoutput3.G4 : [1024 x 1 x *] }
{ LSTMoutput2.unnamed174 : [1024 x 1 x *] (gradient)
LSTMoutput3.Wcfdc : [1024 x 1 x *] }
{ LSTMoutput1.unnamed166 : [1024 x 1 x *] (gradient)
LSTMoutput2.wxxpbpwhh : [4096 x 1 x *] (gradient)
LSTMoutput3.unnamed185 : [1024 x 1 x *] }
{ LSTMoutput1.dc : [1024 x 1 x *] (gradient)
LSTMoutput2.G1 : [1024 x 1 x *] (gradient)
LSTMoutput3.ft : [1024 x 1 x *] }
{ LSTMoutput1.unnamed165 : [1024 x 1 x *] (gradient)
LSTMoutput3.bft : [1024 x 1 x *] }
{ LSTMoutput2.Wci : [1024] (gradient)
LSTMoutput3.G1 : [1024 x 1 x *] }
{ LSTMoutput2.G3 : [1024 x 1 x *] (gradient)
LSTMoutput3.Wcidc : [1024 x 1 x *] }
{ LSTMoutput1.it : [1024 x 1 x *] (gradient)
LSTMoutput3.unnamed183 : [1024 x 1 x *] }
{ LSTMoutput2.Wcf : [1024] (gradient)
LSTMoutput3.it : [1024 x 1 x *] }
{ LSTMoutput1.unnamed167 : [1024 x 1 x *] (gradient)
LSTMoutput2.whh : [4096 x 1 x *] (gradient)
LSTMoutput3.G2 : [1024 x 1 x *] }
{ LSTMoutput2.b : [4096 x 1] (gradient)
LSTMoutput2.dh : [512 x 1 x *] (gradient)
LSTMoutput3.unnamed184 : [1024 x 1 x *] }
{ LSTMoutput3.Wmr : [512 x 1024] (gradient)
unnamed193 : [132 x *] }
{ LSTMoutputW : [132 x 1 x *]
W : [132 x 512 x 1] (gradient) }
{ LSTMoutput3.output : [512 x 1 x *] (gradient)
LSTMoutputW : [132 x 1 x *] (gradient) }
{ LSTMoutput3.mt : [1024 x 1 x *] (gradient)
unnamed193 : [132 x *] (gradient) }
{ LSTMoutput2.Wcoct : [1024 x 1 x *] (gradient)
LSTMoutput3.G4 : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.ft : [1024 x 1 x *] (gradient)
LSTMoutput3.bft : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.output : [512 x 1 x *] (gradient)
LSTMoutput2.wxxpb : [4096 x 1 x *] (gradient)
LSTMoutput3.it : [1024 x 1 x *] (gradient) }
{ LSTMoutput2.Wh : [4096 x 512] (gradient)
LSTMoutput3.G2 : [1024 x 1 x *] (gradient) }
{ LSTMoutput2.unnamed176 : [1024 x 1 x *] (gradient)
LSTMoutput3.wxxpbpwhh : [4096 x 1 x *] (gradient) }
{ LSTMoutput1.bit : [1024 x 1 x *] (gradient)
LSTMoutput3.unnamed183 : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.bft : [1024 x 1 x *] (gradient)
LSTMoutput2.dc : [1024 x 1 x *] (gradient)
LSTMoutput3.G1 : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.G2 : [1024 x 1 x *] (gradient)
LSTMoutput2.Wcfdc : [1024 x 1 x *] (gradient)
LSTMoutput3.Wcidc : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.unnamed163 : [1024 x 1 x *] (gradient)
LSTMoutput2.unnamed175 : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.Wcidc : [1024 x 1 x *] (gradient)
LSTMoutput2.ft : [1024 x 1 x *] (gradient) }
{ LSTMoutput2.bft : [1024 x 1 x *] (gradient)
LSTMoutput3.dc : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.Wcfdc : [1024 x 1 x *] (gradient)
LSTMoutput2.Wcidc : [1024 x 1 x *] (gradient)
LSTMoutput3.ft : [1024 x 1 x *] (gradient) }
{ LSTMoutput2.unnamed173 : [1024 x 1 x *] (gradient)
LSTMoutput3.unnamed185 : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.Wh : [4096 x 512] (gradient)
LSTMoutput2.G2 : [1024 x 1 x *] (gradient)
LSTMoutput3.Wcfdc : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.wxxpb : [4096 x 1 x *] (gradient)
LSTMoutput2.it : [1024 x 1 x *] (gradient) }
{ LSTMoutput2.output : [512 x 1 x *] (gradient)
LSTMoutput3.wxxpb : [4096 x 1 x *] (gradient) }
{ LSTMoutput2.unnamed177 : [1024 x 1 x *] (gradient)
LSTMoutput3.whh : [4096 x 1 x *] (gradient) }
{ LSTMoutput3.b : [4096 x 1] (gradient)
LSTMoutput3.dh : [512 x 1 x *] (gradient) }
{ LSTMoutput1.dh : [512 x 1 x *]
LSTMoutput1.wxx : [4096 x *] (gradient) }
{ LSTMoutput1.mt : [1024 x 1 x *] (gradient)
LSTMoutput2.dh : [512 x 1 x *]
LSTMoutput2.wxx : [4096 x *] (gradient) }
{ LSTMoutput1.Wco : [1024] (gradient)
LSTMoutput2.dc : [1024 x 1 x *] }
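(A quick consistency check on the memory-sharing summary above, worked from the log's own numbers: 125 of the 217 matrices are aliased onto 56 shared buffers and 92 keep private storage, so 125 + 92 = 217 logical matrices map onto 56 + 92 = 148 physical allocations.)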
08/16/2016 10:02:01: Training 13634692 parameters in 23 out of 23 parameter tensors and 104 nodes with gradient:
08/16/2016 10:02:01: Node 'LSTMoutput1.Wcf' (LearnableParameter operation) : [1024]
08/16/2016 10:02:01: Node 'LSTMoutput1.Wci' (LearnableParameter operation) : [1024]
08/16/2016 10:02:01: Node 'LSTMoutput1.Wco' (LearnableParameter operation) : [1024]
08/16/2016 10:02:01: Node 'LSTMoutput1.Wh' (LearnableParameter operation) : [4096 x 512]
08/16/2016 10:02:01: Node 'LSTMoutput1.Wmr' (LearnableParameter operation) : [512 x 1024]
08/16/2016 10:02:01: Node 'LSTMoutput1.b' (LearnableParameter operation) : [4096 x 1]
08/16/2016 10:02:01: Node 'LSTMoutput1.wx' (LearnableParameter operation) : [4096 x 363]
08/16/2016 10:02:01: Node 'LSTMoutput2.Wcf' (LearnableParameter operation) : [1024]
08/16/2016 10:02:01: Node 'LSTMoutput2.Wci' (LearnableParameter operation) : [1024]
08/16/2016 10:02:01: Node 'LSTMoutput2.Wco' (LearnableParameter operation) : [1024]
08/16/2016 10:02:01: Node 'LSTMoutput2.Wh' (LearnableParameter operation) : [4096 x 512]
08/16/2016 10:02:01: Node 'LSTMoutput2.Wmr' (LearnableParameter operation) : [512 x 1024]
08/16/2016 10:02:01: Node 'LSTMoutput2.b' (LearnableParameter operation) : [4096 x 1]
08/16/2016 10:02:01: Node 'LSTMoutput2.wx' (LearnableParameter operation) : [4096 x 512 x 1]
08/16/2016 10:02:01: Node 'LSTMoutput3.Wcf' (LearnableParameter operation) : [1024]
08/16/2016 10:02:01: Node 'LSTMoutput3.Wci' (LearnableParameter operation) : [1024]
08/16/2016 10:02:01: Node 'LSTMoutput3.Wco' (LearnableParameter operation) : [1024]
08/16/2016 10:02:01: Node 'LSTMoutput3.Wh' (LearnableParameter operation) : [4096 x 512]
08/16/2016 10:02:01: Node 'LSTMoutput3.Wmr' (LearnableParameter operation) : [512 x 1024]
08/16/2016 10:02:01: Node 'LSTMoutput3.b' (LearnableParameter operation) : [4096 x 1]
08/16/2016 10:02:01: Node 'LSTMoutput3.wx' (LearnableParameter operation) : [4096 x 512 x 1]
08/16/2016 10:02:01: Node 'W' (LearnableParameter operation) : [132 x 512 x 1]
08/16/2016 10:02:01: Node 'b' (LearnableParameter operation) : [132 x 1]
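(The 13,634,692 total above can be reproduced from the tensor shapes just listed; a minimal Python sketch, with shapes copied from the log and the helper name ours:

def lstmp_params(input_dim):
    # one projected LSTM layer, shapes as printed above
    return (3 * 1024            # Wci, Wcf, Wco peephole vectors
            + 4096 * input_dim  # wx: stacked gate input weights
            + 4096 * 512        # Wh: stacked gate recurrent weights
            + 512 * 1024        # Wmr: output projection
            + 4096)             # b: gate bias
total = lstmp_params(363) + 2 * lstmp_params(512) + 132 * 512 + 132
print(total)  # -> 13634692

Layer 1 reads the 363-dim features; layers 2 and 3 read the 512-dim projection of the layer below; the final W and b map that projection to the 132 labels.)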
08/16/2016 10:02:01: Precomputing --> 3 PreCompute nodes found.
08/16/2016 10:02:01: featNorm.xMean = Mean()
08/16/2016 10:02:01: featNorm.xStdDev = InvStdDev()
08/16/2016 10:02:01: logPrior.prior = Mean()
minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
08/16/2016 10:02:02: Precomputing --> Completed.
08/16/2016 10:02:02: Starting Epoch 1: learning rate per sample = 0.001953 effective momentum = 0.000000 momentum as time constant = 0.0 samples
minibatchiterator: epoch 0: frames [0..64] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
08/16/2016 10:02:03: Starting minibatch loop.
08/16/2016 10:02:03: Epoch[ 1 of 1]-Minibatch[ 1- 10, 250.00%]: ce = 4.87453079 * 160; err = 0.90625000 * 160; time = 0.5069s; samplesPerSecond = 315.6
08/16/2016 10:02:03: Epoch[ 1 of 1]-Minibatch[ 11- 20, 500.00%]: ce = 4.84628143 * 160; err = 0.69375000 * 160; time = 0.4852s; samplesPerSecond = 329.8
08/16/2016 10:02:04: Finished Epoch[ 1 of 1]: [Training] ce = 4.85708837 * 418; err = 0.80382775 * 418; totalSamplesSeen = 418; learningRatePerSample = 0.001953125; epochTime=1.33633s
08/16/2016 10:02:04: SGD: Saving checkpoint model '/tmp/cntk-test-20160816100054.995555/Examples/Speech/AN4_LSTM@release_gpu/Models/cntkSpeechLSTM.dnn'
08/16/2016 10:02:05: CNTKCommandTrainEnd: speechTrain
08/16/2016 10:02:05: Action "train" complete.
08/16/2016 10:02:05: __COMPLETED__
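(Two numbers above are easy to misread. The learning rate per sample, 0.001953125, matches learningRatesPerMB = 0.5 divided by an effective minibatch of 256 samples, i.e. minibatchSize = 16 frames times nbrUttsIneachRecurrentIter = 16 parallel utterances: 0.5 / 256 = 0.001953125; the exact convention is inferred from the numbers, not stated in the log. The progress percentages exceed 100% because epochSize = 64 is smaller than a single report's span: each 10-minibatch report covers 160 samples, and 160 / 64 = 250%.)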

View file

@ -1 +0,0 @@
__COMPLETED__

View file

@ -1 +0,0 @@
__COMPLETED__

View file

@ -0,0 +1,681 @@
CPU info:
CPU Model Name: Intel(R) Xeon(R) CPU E5-2630 v2 @ 2.60GHz
Hardware threads: 24
Total Memory: 268381192 kB
-------------------------------------------------------------------
=== Running /cygdrive/c/jenkins/workspace/CNTK-Test-Windows-W1/x64/release/cntk.exe configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/LSTM-NDL.cntk currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu DeviceId=-1 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=64]] speechTrain=[reader=[useMersenneTwisterRand=true]] parallelTrain=false
-------------------------------------------------------------------
Build info:
Built time: Aug 16 2016 03:09:16
Last modified date: Fri Aug 12 05:28:23 2016
Build type: Release
Build target: GPU
With 1bit-SGD: yes
Math lib: mkl
CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5
CUB_PATH: c:\src\cub-1.4.1
CUDNN_PATH: c:\NVIDIA\cudnn-4.0\cuda
Build Branch: HEAD
Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
Built by svcphil on Philly-Pool1
Build Path: c:\jenkins\workspace\CNTK-Build-Windows\Source\CNTK\
-------------------------------------------------------------------
Changed current directory to C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
08/16/2016 03:20:22: -------------------------------------------------------------------
08/16/2016 03:20:22: Build info:
08/16/2016 03:20:22: Built time: Aug 16 2016 03:09:16
08/16/2016 03:20:22: Last modified date: Fri Aug 12 05:28:23 2016
08/16/2016 03:20:22: Build type: Release
08/16/2016 03:20:22: Build target: GPU
08/16/2016 03:20:22: With 1bit-SGD: yes
08/16/2016 03:20:22: Math lib: mkl
08/16/2016 03:20:22: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5
08/16/2016 03:20:22: CUB_PATH: c:\src\cub-1.4.1
08/16/2016 03:20:22: CUDNN_PATH: c:\NVIDIA\cudnn-4.0\cuda
08/16/2016 03:20:22: Build Branch: HEAD
08/16/2016 03:20:22: Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
08/16/2016 03:20:22: Built by svcphil on Philly-Pool1
08/16/2016 03:20:22: Build Path: c:\jenkins\workspace\CNTK-Build-Windows\Source\CNTK\
08/16/2016 03:20:22: -------------------------------------------------------------------
08/16/2016 03:20:23: -------------------------------------------------------------------
08/16/2016 03:20:23: GPU info:
08/16/2016 03:20:23: Device[0]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
08/16/2016 03:20:23: Device[1]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
08/16/2016 03:20:23: Device[2]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
08/16/2016 03:20:23: -------------------------------------------------------------------
08/16/2016 03:20:23: Running on DPHAIM-25 at 2016/08/16 03:20:23
08/16/2016 03:20:23: Command line:
C:\jenkins\workspace\CNTK-Test-Windows-W1\x64\release\cntk.exe configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/LSTM-NDL.cntk currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu DeviceId=-1 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=64]] speechTrain=[reader=[useMersenneTwisterRand=true]] parallelTrain=false
08/16/2016 03:20:23: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
08/16/2016 03:20:23: RootDir = ".."
ConfigDir = "$RootDir$/Config"
DataDir = "$RootDir$/Data"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"
deviceId = -1
command = speechTrain
precision = "float"
traceLevel = 1
modelPath = "$ModelDir$/cntkSpeechLSTM.dnn"
parallelTrain = true
frameMode = false
truncated = true
speechTrain = [
action = "train"
nbrUttsIneachRecurrentIter = 16
NDLNetworkBuilder = [
networkDescription = "$ConfigDir$/lstmp-3layer-opt.ndl"
]
SGD = [
epochSize = 0
minibatchSize = 16
learningRatesPerMB = 0.5
numMBsToShowResult = 10
momentumPerMB = 0:0.9
maxEpochs = 4
keepCheckPointFiles = true
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "$DataDir$/glob_0000.scp"
]
labels = [
mlfFile = "$DataDir$/glob_0000.mlf"
labelMappingFile = "$DataDir$/state.list"
labelDim = 132
labelType = "category"
]
]
]
currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu
DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu
DeviceId=-1
timestamping=true
speechTrain=[SGD=[maxEpochs=1]]
speechTrain=[SGD=[epochSize=64]]
speechTrain=[reader=[useMersenneTwisterRand=true]]
parallelTrain=false
08/16/2016 03:20:23: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<
08/16/2016 03:20:23: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
08/16/2016 03:20:23: RootDir = ".."
ConfigDir = "../Config"
DataDir = "../Data"
OutputDir = "../Output"
ModelDir = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu/Models"
deviceId = -1
command = speechTrain
precision = "float"
traceLevel = 1
modelPath = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu/Models/cntkSpeechLSTM.dnn"
parallelTrain = true
frameMode = false
truncated = true
speechTrain = [
action = "train"
nbrUttsIneachRecurrentIter = 16
NDLNetworkBuilder = [
networkDescription = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/lstmp-3layer-opt.ndl"
]
SGD = [
epochSize = 0
minibatchSize = 16
learningRatesPerMB = 0.5
numMBsToShowResult = 10
momentumPerMB = 0:0.9
maxEpochs = 4
keepCheckPointFiles = true
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.scp"
]
labels = [
mlfFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.mlf"
labelMappingFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/state.list"
labelDim = 132
labelType = "category"
]
]
]
currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu
DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu
DeviceId=-1
timestamping=true
speechTrain=[SGD=[maxEpochs=1]]
speechTrain=[SGD=[epochSize=64]]
speechTrain=[reader=[useMersenneTwisterRand=true]]
parallelTrain=false
08/16/2016 03:20:23: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
08/16/2016 03:20:23: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
configparameters: LSTM-NDL.cntk:command=speechTrain
configparameters: LSTM-NDL.cntk:ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
configparameters: LSTM-NDL.cntk:currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
configparameters: LSTM-NDL.cntk:DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
configparameters: LSTM-NDL.cntk:deviceId=-1
configparameters: LSTM-NDL.cntk:frameMode=false
configparameters: LSTM-NDL.cntk:ModelDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu/Models
configparameters: LSTM-NDL.cntk:modelPath=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu/Models/cntkSpeechLSTM.dnn
configparameters: LSTM-NDL.cntk:OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu
configparameters: LSTM-NDL.cntk:parallelTrain=false
configparameters: LSTM-NDL.cntk:precision=float
configparameters: LSTM-NDL.cntk:RootDir=..
configparameters: LSTM-NDL.cntk:RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu
configparameters: LSTM-NDL.cntk:speechTrain=[
action = "train"
nbrUttsIneachRecurrentIter = 16
NDLNetworkBuilder = [
networkDescription = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/lstmp-3layer-opt.ndl"
]
SGD = [
epochSize = 0
minibatchSize = 16
learningRatesPerMB = 0.5
numMBsToShowResult = 10
momentumPerMB = 0:0.9
maxEpochs = 4
keepCheckPointFiles = true
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.scp"
]
labels = [
mlfFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.mlf"
labelMappingFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/state.list"
labelDim = 132
labelType = "category"
]
]
] [SGD=[maxEpochs=1]] [SGD=[epochSize=64]] [reader=[useMersenneTwisterRand=true]]
configparameters: LSTM-NDL.cntk:timestamping=true
configparameters: LSTM-NDL.cntk:traceLevel=1
configparameters: LSTM-NDL.cntk:truncated=true
08/16/2016 03:20:23: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
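(Note how the bracketed command-line arguments survive at the tail of the speechTrain block above: [SGD=[maxEpochs=1]] [SGD=[epochSize=64]] [reader=[useMersenneTwisterRand=true]]. They layer on top of the values from LSTM-NDL.cntk, so although the file sets maxEpochs = 4, this run trains a single epoch, as "CNTKCommandTrainInfo: speechTrain : 1" below confirms. A rough, purely illustrative Python sketch of that layering:

base = {"SGD": {"maxEpochs": 4, "epochSize": 0}}
for override in ({"SGD": {"maxEpochs": 1}}, {"SGD": {"epochSize": 64}}):
    for key, block in override.items():
        base.setdefault(key, {}).update(block)
print(base["SGD"])  # {'maxEpochs': 1, 'epochSize': 64}
)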
08/16/2016 03:20:23: Commands: speechTrain
08/16/2016 03:20:23: Precision = "float"
08/16/2016 03:20:23: CNTKModelPath: C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu/Models/cntkSpeechLSTM.dnn
08/16/2016 03:20:23: CNTKCommandTrainInfo: speechTrain : 1
08/16/2016 03:20:23: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 1
08/16/2016 03:20:23: ##############################################################################
08/16/2016 03:20:23: # #
08/16/2016 03:20:23: # Action "train" #
08/16/2016 03:20:23: # #
08/16/2016 03:20:23: ##############################################################################
08/16/2016 03:20:23: CNTKCommandTrainBegin: speechTrain
NDLBuilder Using CPU
reading script file C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.scp ... 948 entries
total 132 state names in state list C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/state.list
htkmlfreader: reading MLF file C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.mlf ... total 948 entries
...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
label set 0: 129 classes
minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
useParallelTrain option is not enabled. ParallelTrain config will be ignored.
08/16/2016 03:20:24: Creating virgin network.
Node 'LSTMoutput1.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput1.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput1.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput1.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput1.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
Node 'LSTMoutput2.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput2.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput2.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput2.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput2.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
Node 'LSTMoutput3.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput3.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput3.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput3.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput3.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
Node 'b' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
Node 'LSTMoutput1.wx' (LearnableParameter operation): Initializing Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput1.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput1.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput1.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=3, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput1.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=4, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput1.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=5, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput1.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=6, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput2.wx' (LearnableParameter operation): Initializing Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput2.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput2.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput2.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=9, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput2.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=10, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput2.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=11, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput2.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=12, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput3.wx' (LearnableParameter operation): Initializing Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput3.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput3.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput3.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=15, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput3.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=16, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput3.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=17, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput3.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=18, range=0.050000*1.000000, onCPU=false).
Node 'W' (LearnableParameter operation): Initializing Parameter[132 x 0] as uniform later when dimensions are fully known.
Node 'b' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
Post-processing network...
6 roots:
ce = CrossEntropyWithSoftmax()
err = ErrorPrediction()
featNorm.xMean = Mean()
featNorm.xStdDev = InvStdDev()
logPrior.prior = Mean()
scaledLogLikelihood = Minus()
Loop[0] --> Loop_LSTMoutput1.output -> 24 nodes
LSTMoutput1.dh LSTMoutput1.whh LSTMoutput1.wxxpbpwhh
LSTMoutput1.G4 LSTMoutput1.G3 LSTMoutput1.dc
LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft
LSTMoutput1.bft LSTMoutput1.G1 LSTMoutput1.Wcidc
LSTMoutput1.unnamed163 LSTMoutput1.it LSTMoutput1.G2
LSTMoutput1.unnamed164 LSTMoutput1.bit LSTMoutput1.ct
LSTMoutput1.Wcoct LSTMoutput1.unnamed166 LSTMoutput1.ot
LSTMoutput1.unnamed167 LSTMoutput1.mt LSTMoutput1.output
Loop[1] --> Loop_LSTMoutput2.output -> 24 nodes
LSTMoutput2.dh LSTMoutput2.whh LSTMoutput2.wxxpbpwhh
LSTMoutput2.G4 LSTMoutput2.G3 LSTMoutput2.dc
LSTMoutput2.Wcfdc LSTMoutput2.unnamed175 LSTMoutput2.ft
LSTMoutput2.bft LSTMoutput2.G1 LSTMoutput2.Wcidc
LSTMoutput2.unnamed173 LSTMoutput2.it LSTMoutput2.G2
LSTMoutput2.unnamed174 LSTMoutput2.bit LSTMoutput2.ct
LSTMoutput2.Wcoct LSTMoutput2.unnamed176 LSTMoutput2.ot
LSTMoutput2.unnamed177 LSTMoutput2.mt LSTMoutput2.output
Loop[2] --> Loop_LSTMoutput3.output -> 24 nodes
LSTMoutput3.dh LSTMoutput3.whh LSTMoutput3.wxxpbpwhh
LSTMoutput3.G4 LSTMoutput3.G3 LSTMoutput3.dc
LSTMoutput3.Wcfdc LSTMoutput3.unnamed185 LSTMoutput3.ft
LSTMoutput3.bft LSTMoutput3.G1 LSTMoutput3.Wcidc
LSTMoutput3.unnamed183 LSTMoutput3.it LSTMoutput3.G2
LSTMoutput3.unnamed184 LSTMoutput3.bit LSTMoutput3.ct
LSTMoutput3.Wcoct LSTMoutput3.unnamed186 LSTMoutput3.ot
LSTMoutput3.unnamed187 LSTMoutput3.mt LSTMoutput3.output
Validating network. 113 nodes to process in pass 1.
Validating --> labels = InputValue() : -> [132 x *]
Validating --> W = LearnableParameter() : -> [132 x 0]
Validating --> LSTMoutput3.Wmr = LearnableParameter() : -> [512 x 1024]
Validating --> LSTMoutput3.wx = LearnableParameter() : -> [4096 x 0]
Validating --> LSTMoutput2.Wmr = LearnableParameter() : -> [512 x 1024]
Validating --> LSTMoutput2.wx = LearnableParameter() : -> [4096 x 0]
Validating --> LSTMoutput1.Wmr = LearnableParameter() : -> [512 x 1024]
Validating --> LSTMoutput1.wx = LearnableParameter() : -> [4096 x 0]
Validating --> features = InputValue() : -> [363 x *]
Validating --> featNorm.xMean = Mean (features) : [363 x *] -> [363]
Validating --> featNorm.xStdDev = InvStdDev (features) : [363 x *] -> [363]
Validating --> featNorm.xNorm = PerDimMeanVarNormalization (features, featNorm.xMean, featNorm.xStdDev) : [363 x *], [363], [363] -> [363 x *]
Node 'LSTMoutput1.wx' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 363].
Node 'LSTMoutput1.wx' (LearnableParameter operation): Initializing Parameter[4096 x 363] <- uniform(seed=1, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput1.wxx = Times (LSTMoutput1.wx, featNorm.xNorm) : [4096 x 363], [363 x *] -> [4096 x *]
Validating --> LSTMoutput1.b = LearnableParameter() : -> [4096 x 1]
Validating --> LSTMoutput1.wxxpb = Plus (LSTMoutput1.wxx, LSTMoutput1.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
Validating --> LSTMoutput1.Wh = LearnableParameter() : -> [4096 x 0]
Validating --> LSTMoutput1.Wco = LearnableParameter() : -> [1024]
Validating --> LSTMoutput1.Wcf = LearnableParameter() : -> [1024]
Validating --> LSTMoutput1.Wci = LearnableParameter() : -> [1024]
Node 'LSTMoutput1.Wh' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512].
Node 'LSTMoutput1.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=2, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput1.whh = Times (LSTMoutput1.Wh, LSTMoutput1.dh) : [4096 x 512], [512] -> [4096]
Validating --> LSTMoutput1.wxxpbpwhh = Plus (LSTMoutput1.wxxpb, LSTMoutput1.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
Validating --> LSTMoutput1.G4 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.G3 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.Wcfdc = DiagTimes (LSTMoutput1.Wcf, LSTMoutput1.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput1.unnamed165 = Plus (LSTMoutput1.G3, LSTMoutput1.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput1.ft = Sigmoid (LSTMoutput1.unnamed165) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.bft = ElementTimes (LSTMoutput1.ft, LSTMoutput1.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput1.G1 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.Wcidc = DiagTimes (LSTMoutput1.Wci, LSTMoutput1.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput1.unnamed163 = Plus (LSTMoutput1.G1, LSTMoutput1.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput1.it = Sigmoid (LSTMoutput1.unnamed163) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.G2 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.unnamed164 = Tanh (LSTMoutput1.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.bit = ElementTimes (LSTMoutput1.it, LSTMoutput1.unnamed164) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.ct = Plus (LSTMoutput1.bft, LSTMoutput1.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.Wcoct = DiagTimes (LSTMoutput1.Wco, LSTMoutput1.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.unnamed166 = Plus (LSTMoutput1.G4, LSTMoutput1.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.ot = Sigmoid (LSTMoutput1.unnamed166) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.unnamed167 = Tanh (LSTMoutput1.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.mt = ElementTimes (LSTMoutput1.ot, LSTMoutput1.unnamed167) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.output = Times (LSTMoutput1.Wmr, LSTMoutput1.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
Node 'LSTMoutput2.wx' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512 x 1].
Node 'LSTMoutput2.wx' (LearnableParameter operation): Initializing Parameter[4096 x 512 x 1] <- uniform(seed=7, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput2.wxx = Times (LSTMoutput2.wx, LSTMoutput1.output) : [4096 x 512 x 1], [512 x 1 x *] -> [4096 x *]
Validating --> LSTMoutput2.b = LearnableParameter() : -> [4096 x 1]
Validating --> LSTMoutput2.wxxpb = Plus (LSTMoutput2.wxx, LSTMoutput2.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
Validating --> LSTMoutput2.Wh = LearnableParameter() : -> [4096 x 0]
Validating --> LSTMoutput2.Wco = LearnableParameter() : -> [1024]
Validating --> LSTMoutput2.Wcf = LearnableParameter() : -> [1024]
Validating --> LSTMoutput2.Wci = LearnableParameter() : -> [1024]
Node 'LSTMoutput2.Wh' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512].
Node 'LSTMoutput2.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=8, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput2.whh = Times (LSTMoutput2.Wh, LSTMoutput2.dh) : [4096 x 512], [512] -> [4096]
Validating --> LSTMoutput2.wxxpbpwhh = Plus (LSTMoutput2.wxxpb, LSTMoutput2.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
Validating --> LSTMoutput2.G4 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.G3 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.Wcfdc = DiagTimes (LSTMoutput2.Wcf, LSTMoutput2.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput2.unnamed175 = Plus (LSTMoutput2.G3, LSTMoutput2.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput2.ft = Sigmoid (LSTMoutput2.unnamed175) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.bft = ElementTimes (LSTMoutput2.ft, LSTMoutput2.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput2.G1 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.Wcidc = DiagTimes (LSTMoutput2.Wci, LSTMoutput2.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput2.unnamed173 = Plus (LSTMoutput2.G1, LSTMoutput2.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput2.it = Sigmoid (LSTMoutput2.unnamed173) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.G2 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.unnamed174 = Tanh (LSTMoutput2.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.bit = ElementTimes (LSTMoutput2.it, LSTMoutput2.unnamed174) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.ct = Plus (LSTMoutput2.bft, LSTMoutput2.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.Wcoct = DiagTimes (LSTMoutput2.Wco, LSTMoutput2.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.unnamed176 = Plus (LSTMoutput2.G4, LSTMoutput2.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.ot = Sigmoid (LSTMoutput2.unnamed176) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.unnamed177 = Tanh (LSTMoutput2.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.mt = ElementTimes (LSTMoutput2.ot, LSTMoutput2.unnamed177) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.output = Times (LSTMoutput2.Wmr, LSTMoutput2.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
Node 'LSTMoutput3.wx' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512 x 1].
Node 'LSTMoutput3.wx' (LearnableParameter operation): Initializing Parameter[4096 x 512 x 1] <- uniform(seed=13, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput3.wxx = Times (LSTMoutput3.wx, LSTMoutput2.output) : [4096 x 512 x 1], [512 x 1 x *] -> [4096 x *]
Validating --> LSTMoutput3.b = LearnableParameter() : -> [4096 x 1]
Validating --> LSTMoutput3.wxxpb = Plus (LSTMoutput3.wxx, LSTMoutput3.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
Validating --> LSTMoutput3.Wh = LearnableParameter() : -> [4096 x 0]
Validating --> LSTMoutput3.Wco = LearnableParameter() : -> [1024]
Validating --> LSTMoutput3.Wcf = LearnableParameter() : -> [1024]
Validating --> LSTMoutput3.Wci = LearnableParameter() : -> [1024]
Node 'LSTMoutput3.Wh' (LearnableParameter operation) operation: Tensor shape was inferred as [4096 x 512].
Node 'LSTMoutput3.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=14, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput3.whh = Times (LSTMoutput3.Wh, LSTMoutput3.dh) : [4096 x 512], [512] -> [4096]
Validating --> LSTMoutput3.wxxpbpwhh = Plus (LSTMoutput3.wxxpb, LSTMoutput3.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
Validating --> LSTMoutput3.G4 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.G3 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.Wcfdc = DiagTimes (LSTMoutput3.Wcf, LSTMoutput3.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput3.unnamed185 = Plus (LSTMoutput3.G3, LSTMoutput3.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput3.ft = Sigmoid (LSTMoutput3.unnamed185) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.bft = ElementTimes (LSTMoutput3.ft, LSTMoutput3.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput3.G1 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.Wcidc = DiagTimes (LSTMoutput3.Wci, LSTMoutput3.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput3.unnamed183 = Plus (LSTMoutput3.G1, LSTMoutput3.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput3.it = Sigmoid (LSTMoutput3.unnamed183) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.G2 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.unnamed184 = Tanh (LSTMoutput3.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.bit = ElementTimes (LSTMoutput3.it, LSTMoutput3.unnamed184) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.ct = Plus (LSTMoutput3.bft, LSTMoutput3.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.Wcoct = DiagTimes (LSTMoutput3.Wco, LSTMoutput3.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.unnamed186 = Plus (LSTMoutput3.G4, LSTMoutput3.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.ot = Sigmoid (LSTMoutput3.unnamed186) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.unnamed187 = Tanh (LSTMoutput3.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.mt = ElementTimes (LSTMoutput3.ot, LSTMoutput3.unnamed187) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.output = Times (LSTMoutput3.Wmr, LSTMoutput3.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
Node 'W' (LearnableParameter operation) operation: Tensor shape was inferred as [132 x 512 x 1].
Node 'W' (LearnableParameter operation): Initializing Parameter[132 x 512 x 1] <- uniform(seed=19, range=0.050000*1.000000, onCPU=false).
Validating --> unnamed193 = Times (W, LSTMoutput3.output) : [132 x 512 x 1], [512 x 1 x *] -> [132 x *]
Validating --> b = LearnableParameter() : -> [132 x 1]
Validating --> LSTMoutputW = Plus (unnamed193, b) : [132 x *], [132 x 1] -> [132 x 1 x *]
Validating --> ce = CrossEntropyWithSoftmax (labels, LSTMoutputW) : [132 x *], [132 x 1 x *] -> [1]
Validating --> err = ErrorPrediction (labels, LSTMoutputW) : [132 x *], [132 x 1 x *] -> [1]
Validating --> logPrior.prior = Mean (labels) : [132 x *] -> [132]
Validating --> logPrior.logPrior = Log (logPrior.prior) : [132] -> [132]
Validating --> scaledLogLikelihood = Minus (LSTMoutputW, logPrior.logPrior) : [132 x 1 x *], [132] -> [132 x 1 x *]
Validating network. 88 nodes to process in pass 2.
Validating --> LSTMoutput1.dh = PastValue (LSTMoutput1.output) : [512 x 1 x *] -> [512 x 1 x *]
Validating --> LSTMoutput1.whh = Times (LSTMoutput1.Wh, LSTMoutput1.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
Validating --> LSTMoutput1.dc = PastValue (LSTMoutput1.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.Wcfdc = DiagTimes (LSTMoutput1.Wcf, LSTMoutput1.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.Wcidc = DiagTimes (LSTMoutput1.Wci, LSTMoutput1.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.dh = PastValue (LSTMoutput2.output) : [512 x 1 x *] -> [512 x 1 x *]
Validating --> LSTMoutput2.whh = Times (LSTMoutput2.Wh, LSTMoutput2.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
Validating --> LSTMoutput2.dc = PastValue (LSTMoutput2.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.Wcfdc = DiagTimes (LSTMoutput2.Wcf, LSTMoutput2.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.Wcidc = DiagTimes (LSTMoutput2.Wci, LSTMoutput2.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.dh = PastValue (LSTMoutput3.output) : [512 x 1 x *] -> [512 x 1 x *]
Validating --> LSTMoutput3.whh = Times (LSTMoutput3.Wh, LSTMoutput3.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
Validating --> LSTMoutput3.dc = PastValue (LSTMoutput3.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.Wcfdc = DiagTimes (LSTMoutput3.Wcf, LSTMoutput3.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.Wcidc = DiagTimes (LSTMoutput3.Wci, LSTMoutput3.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
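(Taken together, the validation lines above spell out the recurrence of a projected LSTM with peephole connections: the stacked gate pre-activation wxxpbpwhh is sliced into G1..G4, the input and forget gates peek at the previous cell state dc, the output gate peeks at the fresh cell state ct, and Wmr projects the 1024-dim cell output down to 512. A minimal numpy sketch of one step under that reading; names mirror the log, vectors are flattened, and this is an illustration rather than CNTK's implementation:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstmp_step(x, dh, dc, wx, b, Wh, Wci, Wcf, Wco, Wmr):
    g = wx @ x + b + Wh @ dh            # wxxpbpwhh: [4096]
    G1, G2, G3, G4 = np.split(g, 4)     # four 1024-dim gate slices
    it = sigmoid(G1 + Wci * dc)         # input gate (unnamed163)
    ft = sigmoid(G3 + Wcf * dc)         # forget gate (unnamed165)
    ct = ft * dc + it * np.tanh(G2)     # bft + bit
    ot = sigmoid(G4 + Wco * ct)         # output gate (unnamed166)
    mt = ot * np.tanh(ct)               # cell output
    return Wmr @ mt, ct                 # projected output [512], new cell [1024]

dh and dc are the previous step's output and ct, i.e. the PastValue nodes resolved in pass 2 above.)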
Validating network. 15 nodes to process in pass 3.
Validating network, final pass.
29 out of 113 nodes do not share the minibatch layout with the input data.
Post-processing network complete.
08/16/2016 03:20:24: Created model with 113 nodes on CPU.
08/16/2016 03:20:24: Training criterion node(s):
08/16/2016 03:20:24: ce = CrossEntropyWithSoftmax
08/16/2016 03:20:24: Evaluation criterion node(s):
08/16/2016 03:20:24: err = ErrorPrediction
Allocating matrices for forward and/or backward propagation.
Memory Sharing: Out of 217 matrices, 125 are shared as 56, and 92 are not shared.
{ LSTMoutput1.dh : [512 x 1 x *]
LSTMoutput1.wxx : [4096 x *] (gradient) }
{ LSTMoutput2.mt : [1024 x 1 x *] (gradient)
LSTMoutput3.dh : [512 x 1 x *]
LSTMoutput3.wxx : [4096 x *] (gradient) }
{ LSTMoutput2.Wco : [1024] (gradient)
LSTMoutput3.dc : [1024 x 1 x *] }
{ LSTMoutput1.mt : [1024 x 1 x *] (gradient)
LSTMoutput2.dh : [512 x 1 x *]
LSTMoutput2.wxx : [4096 x *] (gradient) }
{ LSTMoutput1.Wco : [1024] (gradient)
LSTMoutput2.dc : [1024 x 1 x *] }
{ LSTMoutput1.G3 : [1024 x 1 x *] (gradient)
LSTMoutput2.Wcidc : [1024 x 1 x *] }
{ LSTMoutput1.unnamed164 : [1024 x 1 x *] (gradient)
LSTMoutput2.Wcfdc : [1024 x 1 x *] }
{ LSTMoutput1.Wci : [1024] (gradient)
LSTMoutput2.G1 : [1024 x 1 x *] }
{ LSTMoutput1.wxxpbpwhh : [4096 x 1 x *] (gradient)
LSTMoutput2.unnamed175 : [1024 x 1 x *] }
{ LSTMoutput2.Wcf : [1024] (gradient)
LSTMoutput3.it : [1024 x 1 x *] }
{ LSTMoutput1.ct : [1024 x 1 x *] (gradient)
LSTMoutput2.wxxpbpwhh : [4096 x 1 x *] }
{ LSTMoutput3.wx : [4096 x 512 x 1] (gradient)
LSTMoutput3.wxxpb : [4096 x 1 x *] }
{ LSTMoutput1.Wmr : [512 x 1024] (gradient)
LSTMoutput2.wxx : [4096 x *] }
{ LSTMoutput1.Wcoct : [1024 x 1 x *] (gradient)
LSTMoutput2.G4 : [1024 x 1 x *] (gradient)
LSTMoutput3.G4 : [1024 x 1 x *] }
{ LSTMoutput1.Wcf : [1024] (gradient)
LSTMoutput2.it : [1024 x 1 x *] }
{ LSTMoutput2.unnamed174 : [1024 x 1 x *] (gradient)
LSTMoutput3.Wcfdc : [1024 x 1 x *] }
{ LSTMoutput1.G1 : [1024 x 1 x *] (gradient)
LSTMoutput2.ft : [1024 x 1 x *] }
{ LSTMoutput1.dc : [1024 x 1 x *] (gradient)
LSTMoutput2.G1 : [1024 x 1 x *] (gradient)
LSTMoutput3.ft : [1024 x 1 x *] }
{ LSTMoutput1.unnamed165 : [1024 x 1 x *] (gradient)
LSTMoutput3.bft : [1024 x 1 x *] }
{ LSTMoutput2.G3 : [1024 x 1 x *] (gradient)
LSTMoutput3.Wcidc : [1024 x 1 x *] }
{ LSTMoutput1.ot : [1024 x 1 x *] (gradient)
LSTMoutput2.whh : [4096 x 1 x *] }
{ LSTMoutput2.ot : [1024 x 1 x *] (gradient)
LSTMoutput3.whh : [4096 x 1 x *] }
{ LSTMoutput2.ct : [1024 x 1 x *] (gradient)
LSTMoutput3.wxxpbpwhh : [4096 x 1 x *] }
{ LSTMoutput1.whh : [4096 x 1 x *] (gradient)
LSTMoutput2.G2 : [1024 x 1 x *] }
{ LSTMoutput2.wx : [4096 x 512 x 1] (gradient)
LSTMoutput2.wxxpb : [4096 x 1 x *] }
{ LSTMoutput1.b : [4096 x 1] (gradient)
LSTMoutput1.dh : [512 x 1 x *] (gradient)
LSTMoutput2.unnamed174 : [1024 x 1 x *] }
{ LSTMoutput1.unnamed166 : [1024 x 1 x *] (gradient)
LSTMoutput2.wxxpbpwhh : [4096 x 1 x *] (gradient)
LSTMoutput3.unnamed185 : [1024 x 1 x *] }
{ LSTMoutput2.Wci : [1024] (gradient)
LSTMoutput3.G1 : [1024 x 1 x *] }
{ LSTMoutput1.it : [1024 x 1 x *] (gradient)
LSTMoutput3.unnamed183 : [1024 x 1 x *] }
{ LSTMoutput1.unnamed167 : [1024 x 1 x *] (gradient)
LSTMoutput2.whh : [4096 x 1 x *] (gradient)
LSTMoutput3.G2 : [1024 x 1 x *] }
{ LSTMoutput2.Wmr : [512 x 1024] (gradient)
LSTMoutput3.wxx : [4096 x *] }
{ LSTMoutput2.b : [4096 x 1] (gradient)
LSTMoutput2.dh : [512 x 1 x *] (gradient)
LSTMoutput3.unnamed184 : [1024 x 1 x *] }
{ LSTMoutput1.G4 : [1024 x 1 x *] (gradient)
LSTMoutput2.G4 : [1024 x 1 x *] }
{ LSTMoutput2.unnamed176 : [1024 x 1 x *] (gradient)
LSTMoutput3.wxxpbpwhh : [4096 x 1 x *] (gradient) }
{ LSTMoutput1.bit : [1024 x 1 x *] (gradient)
LSTMoutput3.unnamed183 : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.Wh : [4096 x 512] (gradient)
LSTMoutput2.G2 : [1024 x 1 x *] (gradient)
LSTMoutput3.Wcfdc : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.wxxpb : [4096 x 1 x *] (gradient)
LSTMoutput2.it : [1024 x 1 x *] (gradient) }
{ LSTMoutput2.unnamed177 : [1024 x 1 x *] (gradient)
LSTMoutput3.whh : [4096 x 1 x *] (gradient) }
{ LSTMoutput3.output : [512 x 1 x *] (gradient)
LSTMoutputW : [132 x 1 x *] (gradient) }
{ LSTMoutput2.bft : [1024 x 1 x *] (gradient)
LSTMoutput3.dc : [1024 x 1 x *] (gradient) }
{ LSTMoutput2.Wh : [4096 x 512] (gradient)
LSTMoutput3.G2 : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.bft : [1024 x 1 x *] (gradient)
LSTMoutput2.dc : [1024 x 1 x *] (gradient)
LSTMoutput3.G1 : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.unnamed163 : [1024 x 1 x *] (gradient)
LSTMoutput2.unnamed175 : [1024 x 1 x *] (gradient) }
{ LSTMoutput2.unnamed173 : [1024 x 1 x *] (gradient)
LSTMoutput3.unnamed185 : [1024 x 1 x *] (gradient) }
{ LSTMoutput2.output : [512 x 1 x *] (gradient)
LSTMoutput3.wxxpb : [4096 x 1 x *] (gradient) }
{ LSTMoutput3.b : [4096 x 1] (gradient)
LSTMoutput3.dh : [512 x 1 x *] (gradient) }
{ LSTMoutput2.Wcoct : [1024 x 1 x *] (gradient)
LSTMoutput3.G4 : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.Wcidc : [1024 x 1 x *] (gradient)
LSTMoutput2.ft : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.G2 : [1024 x 1 x *] (gradient)
LSTMoutput2.Wcfdc : [1024 x 1 x *] (gradient)
LSTMoutput3.Wcidc : [1024 x 1 x *] (gradient) }
{ LSTMoutput3.Wmr : [512 x 1024] (gradient)
unnamed193 : [132 x *] }
{ LSTMoutput1.output : [512 x 1 x *] (gradient)
LSTMoutput2.wxxpb : [4096 x 1 x *] (gradient)
LSTMoutput3.it : [1024 x 1 x *] (gradient) }
{ LSTMoutput3.mt : [1024 x 1 x *] (gradient)
unnamed193 : [132 x *] (gradient) }
{ LSTMoutput1.Wcfdc : [1024 x 1 x *] (gradient)
LSTMoutput2.Wcidc : [1024 x 1 x *] (gradient)
LSTMoutput3.ft : [1024 x 1 x *] (gradient) }
{ LSTMoutputW : [132 x 1 x *]
W : [132 x 512 x 1] (gradient) }
{ LSTMoutput1.ft : [1024 x 1 x *] (gradient)
LSTMoutput3.bft : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.wx : [4096 x 363] (gradient)
LSTMoutput1.wxxpb : [4096 x 1 x *] }
08/16/2016 03:20:24: Training 13634692 parameters in 23 out of 23 parameter tensors and 104 nodes with gradient:
08/16/2016 03:20:24: Node 'LSTMoutput1.Wcf' (LearnableParameter operation) : [1024]
08/16/2016 03:20:24: Node 'LSTMoutput1.Wci' (LearnableParameter operation) : [1024]
08/16/2016 03:20:24: Node 'LSTMoutput1.Wco' (LearnableParameter operation) : [1024]
08/16/2016 03:20:24: Node 'LSTMoutput1.Wh' (LearnableParameter operation) : [4096 x 512]
08/16/2016 03:20:24: Node 'LSTMoutput1.Wmr' (LearnableParameter operation) : [512 x 1024]
08/16/2016 03:20:24: Node 'LSTMoutput1.b' (LearnableParameter operation) : [4096 x 1]
08/16/2016 03:20:24: Node 'LSTMoutput1.wx' (LearnableParameter operation) : [4096 x 363]
08/16/2016 03:20:24: Node 'LSTMoutput2.Wcf' (LearnableParameter operation) : [1024]
08/16/2016 03:20:24: Node 'LSTMoutput2.Wci' (LearnableParameter operation) : [1024]
08/16/2016 03:20:24: Node 'LSTMoutput2.Wco' (LearnableParameter operation) : [1024]
08/16/2016 03:20:24: Node 'LSTMoutput2.Wh' (LearnableParameter operation) : [4096 x 512]
08/16/2016 03:20:24: Node 'LSTMoutput2.Wmr' (LearnableParameter operation) : [512 x 1024]
08/16/2016 03:20:24: Node 'LSTMoutput2.b' (LearnableParameter operation) : [4096 x 1]
08/16/2016 03:20:24: Node 'LSTMoutput2.wx' (LearnableParameter operation) : [4096 x 512 x 1]
08/16/2016 03:20:24: Node 'LSTMoutput3.Wcf' (LearnableParameter operation) : [1024]
08/16/2016 03:20:24: Node 'LSTMoutput3.Wci' (LearnableParameter operation) : [1024]
08/16/2016 03:20:24: Node 'LSTMoutput3.Wco' (LearnableParameter operation) : [1024]
08/16/2016 03:20:24: Node 'LSTMoutput3.Wh' (LearnableParameter operation) : [4096 x 512]
08/16/2016 03:20:24: Node 'LSTMoutput3.Wmr' (LearnableParameter operation) : [512 x 1024]
08/16/2016 03:20:24: Node 'LSTMoutput3.b' (LearnableParameter operation) : [4096 x 1]
08/16/2016 03:20:24: Node 'LSTMoutput3.wx' (LearnableParameter operation) : [4096 x 512 x 1]
08/16/2016 03:20:24: Node 'W' (LearnableParameter operation) : [132 x 512 x 1]
08/16/2016 03:20:24: Node 'b' (LearnableParameter operation) : [132 x 1]
08/16/2016 03:20:24: Precomputing --> 3 PreCompute nodes found.
08/16/2016 03:20:24: featNorm.xMean = Mean()
08/16/2016 03:20:24: featNorm.xStdDev = InvStdDev()
08/16/2016 03:20:24: logPrior.prior = Mean()
minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
08/16/2016 03:20:27: Precomputing --> Completed.
08/16/2016 03:20:28: Starting Epoch 1: learning rate per sample = 0.001953 effective momentum = 0.000000 momentum as time constant = 0.0 samples
minibatchiterator: epoch 0: frames [0..64] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
08/16/2016 03:20:28: Starting minibatch loop.
08/16/2016 03:20:31: Epoch[ 1 of 1]-Minibatch[ 1- 10, 250.00%]: ce = 4.87950134 * 160; err = 0.90625000 * 160; time = 3.6415s; samplesPerSecond = 43.9
08/16/2016 03:20:35: Epoch[ 1 of 1]-Minibatch[ 11- 20, 500.00%]: ce = 4.84555817 * 160; err = 0.69375000 * 160; time = 3.6742s; samplesPerSecond = 43.5
08/16/2016 03:20:38: Finished Epoch[ 1 of 1]: [Training] ce = 4.85900003 * 418; err = 0.80382775 * 418; totalSamplesSeen = 418; learningRatePerSample = 0.001953125; epochTime=9.76851s
08/16/2016 03:20:38: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_cpu/Models/cntkSpeechLSTM.dnn'
08/16/2016 03:20:39: CNTKCommandTrainEnd: speechTrain
08/16/2016 03:20:39: Action "train" complete.
08/16/2016 03:20:39: __COMPLETED__

View file

@ -0,0 +1,682 @@
CPU info:
CPU Model Name: Intel(R) Xeon(R) CPU E5-2630 v2 @ 2.60GHz
Hardware threads: 24
Total Memory: 268381192 kB
-------------------------------------------------------------------
=== Running /cygdrive/c/jenkins/workspace/CNTK-Test-Windows-W1/x64/release/cntk.exe configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/LSTM-NDL.cntk currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu DeviceId=0 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=64]] speechTrain=[reader=[useMersenneTwisterRand=true]] parallelTrain=false
-------------------------------------------------------------------
Build info:
Built time: Aug 16 2016 03:09:16
Last modified date: Fri Aug 12 05:28:23 2016
Build type: Release
Build target: GPU
With 1bit-SGD: yes
Math lib: mkl
CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5
CUB_PATH: c:\src\cub-1.4.1
CUDNN_PATH: c:\NVIDIA\cudnn-4.0\cuda
Build Branch: HEAD
Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
Built by svcphil on Philly-Pool1
Build Path: c:\jenkins\workspace\CNTK-Build-Windows\Source\CNTK\
-------------------------------------------------------------------
Changed current directory to C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
08/16/2016 03:20:41: -------------------------------------------------------------------
08/16/2016 03:20:41: Build info:
08/16/2016 03:20:41: Built time: Aug 16 2016 03:09:16
08/16/2016 03:20:41: Last modified date: Fri Aug 12 05:28:23 2016
08/16/2016 03:20:41: Build type: Release
08/16/2016 03:20:41: Build target: GPU
08/16/2016 03:20:41: With 1bit-SGD: yes
08/16/2016 03:20:41: Math lib: mkl
08/16/2016 03:20:41: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5
08/16/2016 03:20:41: CUB_PATH: c:\src\cub-1.4.1
08/16/2016 03:20:41: CUDNN_PATH: c:\NVIDIA\cudnn-4.0\cuda
08/16/2016 03:20:41: Build Branch: HEAD
08/16/2016 03:20:41: Build SHA1: 026b1e772b963461e189f8f00aa7ed6951298f84
08/16/2016 03:20:41: Built by svcphil on Philly-Pool1
08/16/2016 03:20:41: Build Path: c:\jenkins\workspace\CNTK-Build-Windows\Source\CNTK\
08/16/2016 03:20:41: -------------------------------------------------------------------
08/16/2016 03:20:43: -------------------------------------------------------------------
08/16/2016 03:20:43: GPU info:
08/16/2016 03:20:43: Device[0]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
08/16/2016 03:20:43: Device[1]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
08/16/2016 03:20:43: Device[2]: cores = 2880; computeCapability = 3.5; type = "GeForce GTX 780 Ti"; memory = 3072 MB
08/16/2016 03:20:43: -------------------------------------------------------------------
08/16/2016 03:20:43: Running on DPHAIM-25 at 2016/08/16 03:20:43
08/16/2016 03:20:43: Command line:
C:\jenkins\workspace\CNTK-Test-Windows-W1\x64\release\cntk.exe configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/LSTM-NDL.cntk currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu DeviceId=0 timestamping=true speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=64]] speechTrain=[reader=[useMersenneTwisterRand=true]] parallelTrain=false
08/16/2016 03:20:43: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
08/16/2016 03:20:43: RootDir = ".."
ConfigDir = "$RootDir$/Config"
DataDir = "$RootDir$/Data"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"
deviceId = -1
command = speechTrain
precision = "float"
traceLevel = 1
modelPath = "$ModelDir$/cntkSpeechLSTM.dnn"
parallelTrain = true
frameMode = false
truncated = true
speechTrain = [
action = "train"
nbrUttsIneachRecurrentIter = 16
NDLNetworkBuilder = [
networkDescription = "$ConfigDir$/lstmp-3layer-opt.ndl"
]
SGD = [
epochSize = 0
minibatchSize = 16
learningRatesPerMB = 0.5
numMBsToShowResult = 10
momentumPerMB = 0:0.9
maxEpochs = 4
keepCheckPointFiles = true
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "$DataDir$/glob_0000.scp"
]
labels = [
mlfFile = "$DataDir$/glob_0000.mlf"
labelMappingFile = "$DataDir$/state.list"
labelDim = 132
labelType = "category"
]
]
]
currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu
DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu
DeviceId=0
timestamping=true
speechTrain=[SGD=[maxEpochs=1]]
speechTrain=[SGD=[epochSize=64]]
speechTrain=[reader=[useMersenneTwisterRand=true]]
parallelTrain=false
08/16/2016 03:20:43: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<
08/16/2016 03:20:43: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
08/16/2016 03:20:43: RootDir = ".."
ConfigDir = "../Config"
DataDir = "../Data"
OutputDir = "../Output"
ModelDir = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu/Models"
deviceId = -1
command = speechTrain
precision = "float"
traceLevel = 1
modelPath = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu/Models/cntkSpeechLSTM.dnn"
parallelTrain = true
frameMode = false
truncated = true
speechTrain = [
action = "train"
nbrUttsIneachRecurrentIter = 16
NDLNetworkBuilder = [
networkDescription = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/lstmp-3layer-opt.ndl"
]
SGD = [
epochSize = 0
minibatchSize = 16
learningRatesPerMB = 0.5
numMBsToShowResult = 10
momentumPerMB = 0:0.9
maxEpochs = 4
keepCheckPointFiles = true
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.scp"
]
labels = [
mlfFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.mlf"
labelMappingFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/state.list"
labelDim = 132
labelType = "category"
]
]
]
currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu
DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu
DeviceId=0
timestamping=true
speechTrain=[SGD=[maxEpochs=1]]
speechTrain=[SGD=[epochSize=64]]
speechTrain=[reader=[useMersenneTwisterRand=true]]
parallelTrain=false
08/16/2016 03:20:43: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
08/16/2016 03:20:43: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
configparameters: LSTM-NDL.cntk:command=speechTrain
configparameters: LSTM-NDL.cntk:ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config
configparameters: LSTM-NDL.cntk:currentDirectory=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
configparameters: LSTM-NDL.cntk:DataDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data
configparameters: LSTM-NDL.cntk:deviceId=0
configparameters: LSTM-NDL.cntk:frameMode=false
configparameters: LSTM-NDL.cntk:ModelDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu/Models
configparameters: LSTM-NDL.cntk:modelPath=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu/Models/cntkSpeechLSTM.dnn
configparameters: LSTM-NDL.cntk:OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu
configparameters: LSTM-NDL.cntk:parallelTrain=false
configparameters: LSTM-NDL.cntk:precision=float
configparameters: LSTM-NDL.cntk:RootDir=..
configparameters: LSTM-NDL.cntk:RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu
configparameters: LSTM-NDL.cntk:speechTrain=[
action = "train"
nbrUttsIneachRecurrentIter = 16
NDLNetworkBuilder = [
networkDescription = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Config/lstmp-3layer-opt.ndl"
]
SGD = [
epochSize = 0
minibatchSize = 16
learningRatesPerMB = 0.5
numMBsToShowResult = 10
momentumPerMB = 0:0.9
maxEpochs = 4
keepCheckPointFiles = true
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.scp"
]
labels = [
mlfFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.mlf"
labelMappingFile = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/state.list"
labelDim = 132
labelType = "category"
]
]
] [SGD=[maxEpochs=1]] [SGD=[epochSize=64]] [reader=[useMersenneTwisterRand=true]]
configparameters: LSTM-NDL.cntk:timestamping=true
configparameters: LSTM-NDL.cntk:traceLevel=1
configparameters: LSTM-NDL.cntk:truncated=true
08/16/2016 03:20:43: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
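The three dumps above illustrate how CNTK layers its configuration: $Name$ macros are resolved textually (note how ModelDir ends up under the RunDir passed on the command line, because command-line definitions take precedence over those in the file), and command-line overrides such as speechTrain=[SGD=[maxEpochs=1]] are appended to the block they name, as visible at the end of the processed speechTrain section. A minimal Python sketch of the macro-resolution step — an illustration of the behavior observed in this log, not CNTK's actual parser:

    import re

    def resolve(text, variables):
        # Repeatedly substitute $Name$ references; a macro may expand to a
        # string that itself contains macros (OutputDir -> $RootDir$/Output).
        pattern = re.compile(r"\$(\w+)\$")
        while True:
            expanded = pattern.sub(lambda m: variables[m.group(1)], text)
            if expanded == text:
                return expanded
            text = expanded

    variables = {"RootDir": "..", "OutputDir": "$RootDir$/Output"}
    print(resolve("$OutputDir$/Models", variables))   # -> ../Output/Models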
08/16/2016 03:20:43: Commands: speechTrain
08/16/2016 03:20:43: Precision = "float"
08/16/2016 03:20:43: CNTKModelPath: C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu/Models/cntkSpeechLSTM.dnn
08/16/2016 03:20:43: CNTKCommandTrainInfo: speechTrain : 1
08/16/2016 03:20:43: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 1
08/16/2016 03:20:43: ##############################################################################
08/16/2016 03:20:43: # #
08/16/2016 03:20:43: # Action "train" #
08/16/2016 03:20:43: # #
08/16/2016 03:20:43: ##############################################################################
08/16/2016 03:20:43: CNTKCommandTrainBegin: speechTrain
NDLBuilder Using GPU 0
reading script file C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.scp ... 948 entries
total 132 state names in state list C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/state.list
htkmlfreader: reading MLF file C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Speech\AN4\Data/glob_0000.mlf ... total 948 entries
feature set 0: 252734 frames in 948 out of 948 utterances
label set 0: 129 classes
minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
useParallelTrain option is not enabled. ParallelTrain config will be ignored.
08/16/2016 03:20:43: Creating virgin network.
Node 'LSTMoutput1.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput1.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput1.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput1.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput1.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
Node 'LSTMoutput2.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput2.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput2.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput2.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput2.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
Node 'LSTMoutput3.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput3.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput3.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput3.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- 0.000000.
Node 'LSTMoutput3.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- 0.000000.
Node 'b' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
Node 'LSTMoutput1.wx' (LearnableParameter operation): Initializing Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput1.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput1.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput1.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=3, range=0.050000*1.000000, onCPU=false).
Microsoft::MSR::CNTK::GPUMatrix<ElemType>::SetUniformRandomValue (GPU): creating curand object with seed 3, sizeof(ElemType)==4
Node 'LSTMoutput1.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=4, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput1.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=5, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput1.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=6, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput2.wx' (LearnableParameter operation): Initializing Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput2.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput2.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput2.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=9, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput2.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=10, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput2.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=11, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput2.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=12, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput3.wx' (LearnableParameter operation): Initializing Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput3.b' (LearnableParameter operation): Initializing Parameter[4096 x 1] <- 0.000000.
Node 'LSTMoutput3.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 0] as uniform later when dimensions are fully known.
Node 'LSTMoutput3.Wci' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=15, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput3.Wcf' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=16, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput3.Wco' (LearnableParameter operation): Initializing Parameter[1024] <- uniform(seed=17, range=0.050000*1.000000, onCPU=false).
Node 'LSTMoutput3.Wmr' (LearnableParameter operation): Initializing Parameter[512 x 1024] <- uniform(seed=18, range=0.050000*1.000000, onCPU=false).
Node 'W' (LearnableParameter operation): Initializing Parameter[132 x 0] as uniform later when dimensions are fully known.
Node 'b' (LearnableParameter operation): Initializing Parameter[132 x 1] <- 0.000000.
Post-processing network...
6 roots:
ce = CrossEntropyWithSoftmax()
err = ErrorPrediction()
featNorm.xMean = Mean()
featNorm.xStdDev = InvStdDev()
logPrior.prior = Mean()
scaledLogLikelihood = Minus()
Loop[0] --> Loop_LSTMoutput1.output -> 24 nodes
LSTMoutput1.dh LSTMoutput1.whh LSTMoutput1.wxxpbpwhh
LSTMoutput1.G4 LSTMoutput1.G3 LSTMoutput1.dc
LSTMoutput1.Wcfdc LSTMoutput1.unnamed165 LSTMoutput1.ft
LSTMoutput1.bft LSTMoutput1.G1 LSTMoutput1.Wcidc
LSTMoutput1.unnamed163 LSTMoutput1.it LSTMoutput1.G2
LSTMoutput1.unnamed164 LSTMoutput1.bit LSTMoutput1.ct
LSTMoutput1.Wcoct LSTMoutput1.unnamed166 LSTMoutput1.ot
LSTMoutput1.unnamed167 LSTMoutput1.mt LSTMoutput1.output
Loop[1] --> Loop_LSTMoutput2.output -> 24 nodes
LSTMoutput2.dh LSTMoutput2.whh LSTMoutput2.wxxpbpwhh
LSTMoutput2.G4 LSTMoutput2.G3 LSTMoutput2.dc
LSTMoutput2.Wcfdc LSTMoutput2.unnamed175 LSTMoutput2.ft
LSTMoutput2.bft LSTMoutput2.G1 LSTMoutput2.Wcidc
LSTMoutput2.unnamed173 LSTMoutput2.it LSTMoutput2.G2
LSTMoutput2.unnamed174 LSTMoutput2.bit LSTMoutput2.ct
LSTMoutput2.Wcoct LSTMoutput2.unnamed176 LSTMoutput2.ot
LSTMoutput2.unnamed177 LSTMoutput2.mt LSTMoutput2.output
Loop[2] --> Loop_LSTMoutput3.output -> 24 nodes
LSTMoutput3.dh LSTMoutput3.whh LSTMoutput3.wxxpbpwhh
LSTMoutput3.G4 LSTMoutput3.G3 LSTMoutput3.dc
LSTMoutput3.Wcfdc LSTMoutput3.unnamed185 LSTMoutput3.ft
LSTMoutput3.bft LSTMoutput3.G1 LSTMoutput3.Wcidc
LSTMoutput3.unnamed183 LSTMoutput3.it LSTMoutput3.G2
LSTMoutput3.unnamed184 LSTMoutput3.bit LSTMoutput3.ct
LSTMoutput3.Wcoct LSTMoutput3.unnamed186 LSTMoutput3.ot
LSTMoutput3.unnamed187 LSTMoutput3.mt LSTMoutput3.output
Validating network. 113 nodes to process in pass 1.
Validating --> labels = InputValue() : -> [132 x *]
Validating --> W = LearnableParameter() : -> [132 x 0]
Validating --> LSTMoutput3.Wmr = LearnableParameter() : -> [512 x 1024]
Validating --> LSTMoutput3.wx = LearnableParameter() : -> [4096 x 0]
Validating --> LSTMoutput2.Wmr = LearnableParameter() : -> [512 x 1024]
Validating --> LSTMoutput2.wx = LearnableParameter() : -> [4096 x 0]
Validating --> LSTMoutput1.Wmr = LearnableParameter() : -> [512 x 1024]
Validating --> LSTMoutput1.wx = LearnableParameter() : -> [4096 x 0]
Validating --> features = InputValue() : -> [363 x *]
Validating --> featNorm.xMean = Mean (features) : [363 x *] -> [363]
Validating --> featNorm.xStdDev = InvStdDev (features) : [363 x *] -> [363]
Validating --> featNorm.xNorm = PerDimMeanVarNormalization (features, featNorm.xMean, featNorm.xStdDev) : [363 x *], [363], [363] -> [363 x *]
Node 'LSTMoutput1.wx' (LearnableParameter operation): Tensor shape was inferred as [4096 x 363].
Node 'LSTMoutput1.wx' (LearnableParameter operation): Initializing Parameter[4096 x 363] <- uniform(seed=1, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput1.wxx = Times (LSTMoutput1.wx, featNorm.xNorm) : [4096 x 363], [363 x *] -> [4096 x *]
Validating --> LSTMoutput1.b = LearnableParameter() : -> [4096 x 1]
Validating --> LSTMoutput1.wxxpb = Plus (LSTMoutput1.wxx, LSTMoutput1.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
Validating --> LSTMoutput1.Wh = LearnableParameter() : -> [4096 x 0]
Validating --> LSTMoutput1.Wco = LearnableParameter() : -> [1024]
Validating --> LSTMoutput1.Wcf = LearnableParameter() : -> [1024]
Validating --> LSTMoutput1.Wci = LearnableParameter() : -> [1024]
Node 'LSTMoutput1.Wh' (LearnableParameter operation): Tensor shape was inferred as [4096 x 512].
Node 'LSTMoutput1.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=2, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput1.whh = Times (LSTMoutput1.Wh, LSTMoutput1.dh) : [4096 x 512], [512] -> [4096]
Validating --> LSTMoutput1.wxxpbpwhh = Plus (LSTMoutput1.wxxpb, LSTMoutput1.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
Validating --> LSTMoutput1.G4 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.G3 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.Wcfdc = DiagTimes (LSTMoutput1.Wcf, LSTMoutput1.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput1.unnamed165 = Plus (LSTMoutput1.G3, LSTMoutput1.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput1.ft = Sigmoid (LSTMoutput1.unnamed165) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.bft = ElementTimes (LSTMoutput1.ft, LSTMoutput1.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput1.G1 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.Wcidc = DiagTimes (LSTMoutput1.Wci, LSTMoutput1.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput1.unnamed163 = Plus (LSTMoutput1.G1, LSTMoutput1.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput1.it = Sigmoid (LSTMoutput1.unnamed163) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.G2 = Slice (LSTMoutput1.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.unnamed164 = Tanh (LSTMoutput1.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.bit = ElementTimes (LSTMoutput1.it, LSTMoutput1.unnamed164) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.ct = Plus (LSTMoutput1.bft, LSTMoutput1.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.Wcoct = DiagTimes (LSTMoutput1.Wco, LSTMoutput1.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.unnamed166 = Plus (LSTMoutput1.G4, LSTMoutput1.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.ot = Sigmoid (LSTMoutput1.unnamed166) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.unnamed167 = Tanh (LSTMoutput1.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.mt = ElementTimes (LSTMoutput1.ot, LSTMoutput1.unnamed167) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.output = Times (LSTMoutput1.Wmr, LSTMoutput1.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
Node 'LSTMoutput2.wx' (LearnableParameter operation): Tensor shape was inferred as [4096 x 512 x 1].
Node 'LSTMoutput2.wx' (LearnableParameter operation): Initializing Parameter[4096 x 512 x 1] <- uniform(seed=7, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput2.wxx = Times (LSTMoutput2.wx, LSTMoutput1.output) : [4096 x 512 x 1], [512 x 1 x *] -> [4096 x *]
Validating --> LSTMoutput2.b = LearnableParameter() : -> [4096 x 1]
Validating --> LSTMoutput2.wxxpb = Plus (LSTMoutput2.wxx, LSTMoutput2.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
Validating --> LSTMoutput2.Wh = LearnableParameter() : -> [4096 x 0]
Validating --> LSTMoutput2.Wco = LearnableParameter() : -> [1024]
Validating --> LSTMoutput2.Wcf = LearnableParameter() : -> [1024]
Validating --> LSTMoutput2.Wci = LearnableParameter() : -> [1024]
Node 'LSTMoutput2.Wh' (LearnableParameter operation): Tensor shape was inferred as [4096 x 512].
Node 'LSTMoutput2.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=8, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput2.whh = Times (LSTMoutput2.Wh, LSTMoutput2.dh) : [4096 x 512], [512] -> [4096]
Validating --> LSTMoutput2.wxxpbpwhh = Plus (LSTMoutput2.wxxpb, LSTMoutput2.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
Validating --> LSTMoutput2.G4 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.G3 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.Wcfdc = DiagTimes (LSTMoutput2.Wcf, LSTMoutput2.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput2.unnamed175 = Plus (LSTMoutput2.G3, LSTMoutput2.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput2.ft = Sigmoid (LSTMoutput2.unnamed175) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.bft = ElementTimes (LSTMoutput2.ft, LSTMoutput2.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput2.G1 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.Wcidc = DiagTimes (LSTMoutput2.Wci, LSTMoutput2.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput2.unnamed173 = Plus (LSTMoutput2.G1, LSTMoutput2.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput2.it = Sigmoid (LSTMoutput2.unnamed173) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.G2 = Slice (LSTMoutput2.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.unnamed174 = Tanh (LSTMoutput2.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.bit = ElementTimes (LSTMoutput2.it, LSTMoutput2.unnamed174) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.ct = Plus (LSTMoutput2.bft, LSTMoutput2.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.Wcoct = DiagTimes (LSTMoutput2.Wco, LSTMoutput2.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.unnamed176 = Plus (LSTMoutput2.G4, LSTMoutput2.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.ot = Sigmoid (LSTMoutput2.unnamed176) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.unnamed177 = Tanh (LSTMoutput2.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.mt = ElementTimes (LSTMoutput2.ot, LSTMoutput2.unnamed177) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.output = Times (LSTMoutput2.Wmr, LSTMoutput2.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
Node 'LSTMoutput3.wx' (LearnableParameter operation): Tensor shape was inferred as [4096 x 512 x 1].
Node 'LSTMoutput3.wx' (LearnableParameter operation): Initializing Parameter[4096 x 512 x 1] <- uniform(seed=13, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput3.wxx = Times (LSTMoutput3.wx, LSTMoutput2.output) : [4096 x 512 x 1], [512 x 1 x *] -> [4096 x *]
Validating --> LSTMoutput3.b = LearnableParameter() : -> [4096 x 1]
Validating --> LSTMoutput3.wxxpb = Plus (LSTMoutput3.wxx, LSTMoutput3.b) : [4096 x *], [4096 x 1] -> [4096 x 1 x *]
Validating --> LSTMoutput3.Wh = LearnableParameter() : -> [4096 x 0]
Validating --> LSTMoutput3.Wco = LearnableParameter() : -> [1024]
Validating --> LSTMoutput3.Wcf = LearnableParameter() : -> [1024]
Validating --> LSTMoutput3.Wci = LearnableParameter() : -> [1024]
Node 'LSTMoutput3.Wh' (LearnableParameter operation): Tensor shape was inferred as [4096 x 512].
Node 'LSTMoutput3.Wh' (LearnableParameter operation): Initializing Parameter[4096 x 512] <- uniform(seed=14, range=0.050000*1.000000, onCPU=false).
Validating --> LSTMoutput3.whh = Times (LSTMoutput3.Wh, LSTMoutput3.dh) : [4096 x 512], [512] -> [4096]
Validating --> LSTMoutput3.wxxpbpwhh = Plus (LSTMoutput3.wxxpb, LSTMoutput3.whh) : [4096 x 1 x *], [4096] -> [4096 x 1 x *]
Validating --> LSTMoutput3.G4 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.G3 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.Wcfdc = DiagTimes (LSTMoutput3.Wcf, LSTMoutput3.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput3.unnamed185 = Plus (LSTMoutput3.G3, LSTMoutput3.Wcfdc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput3.ft = Sigmoid (LSTMoutput3.unnamed185) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.bft = ElementTimes (LSTMoutput3.ft, LSTMoutput3.dc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput3.G1 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.Wcidc = DiagTimes (LSTMoutput3.Wci, LSTMoutput3.dc) : [1024], [1024] -> [1024]
Validating --> LSTMoutput3.unnamed183 = Plus (LSTMoutput3.G1, LSTMoutput3.Wcidc) : [1024 x 1 x *], [1024] -> [1024 x 1 x *]
Validating --> LSTMoutput3.it = Sigmoid (LSTMoutput3.unnamed183) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.G2 = Slice (LSTMoutput3.wxxpbpwhh) : [4096 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.unnamed184 = Tanh (LSTMoutput3.G2) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.bit = ElementTimes (LSTMoutput3.it, LSTMoutput3.unnamed184) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.ct = Plus (LSTMoutput3.bft, LSTMoutput3.bit) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.Wcoct = DiagTimes (LSTMoutput3.Wco, LSTMoutput3.ct) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.unnamed186 = Plus (LSTMoutput3.G4, LSTMoutput3.Wcoct) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.ot = Sigmoid (LSTMoutput3.unnamed186) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.unnamed187 = Tanh (LSTMoutput3.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.mt = ElementTimes (LSTMoutput3.ot, LSTMoutput3.unnamed187) : [1024 x 1 x *], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.output = Times (LSTMoutput3.Wmr, LSTMoutput3.mt) : [512 x 1024], [1024 x 1 x *] -> [512 x 1 x *]
Node 'W' (LearnableParameter operation): Tensor shape was inferred as [132 x 512 x 1].
Node 'W' (LearnableParameter operation): Initializing Parameter[132 x 512 x 1] <- uniform(seed=19, range=0.050000*1.000000, onCPU=false).
Validating --> unnamed193 = Times (W, LSTMoutput3.output) : [132 x 512 x 1], [512 x 1 x *] -> [132 x *]
Validating --> b = LearnableParameter() : -> [132 x 1]
Validating --> LSTMoutputW = Plus (unnamed193, b) : [132 x *], [132 x 1] -> [132 x 1 x *]
Validating --> ce = CrossEntropyWithSoftmax (labels, LSTMoutputW) : [132 x *], [132 x 1 x *] -> [1]
Validating --> err = ErrorPrediction (labels, LSTMoutputW) : [132 x *], [132 x 1 x *] -> [1]
Validating --> logPrior.prior = Mean (labels) : [132 x *] -> [132]
Validating --> logPrior.logPrior = Log (logPrior.prior) : [132] -> [132]
Validating --> scaledLogLikelihood = Minus (LSTMoutputW, logPrior.logPrior) : [132 x 1 x *], [132] -> [132 x 1 x *]
Validating network. 88 nodes to process in pass 2.
Validating --> LSTMoutput1.dh = PastValue (LSTMoutput1.output) : [512 x 1 x *] -> [512 x 1 x *]
Validating --> LSTMoutput1.whh = Times (LSTMoutput1.Wh, LSTMoutput1.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
Validating --> LSTMoutput1.dc = PastValue (LSTMoutput1.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.Wcfdc = DiagTimes (LSTMoutput1.Wcf, LSTMoutput1.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput1.Wcidc = DiagTimes (LSTMoutput1.Wci, LSTMoutput1.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.dh = PastValue (LSTMoutput2.output) : [512 x 1 x *] -> [512 x 1 x *]
Validating --> LSTMoutput2.whh = Times (LSTMoutput2.Wh, LSTMoutput2.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
Validating --> LSTMoutput2.dc = PastValue (LSTMoutput2.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.Wcfdc = DiagTimes (LSTMoutput2.Wcf, LSTMoutput2.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput2.Wcidc = DiagTimes (LSTMoutput2.Wci, LSTMoutput2.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.dh = PastValue (LSTMoutput3.output) : [512 x 1 x *] -> [512 x 1 x *]
Validating --> LSTMoutput3.whh = Times (LSTMoutput3.Wh, LSTMoutput3.dh) : [4096 x 512], [512 x 1 x *] -> [4096 x 1 x *]
Validating --> LSTMoutput3.dc = PastValue (LSTMoutput3.ct) : [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.Wcfdc = DiagTimes (LSTMoutput3.Wcf, LSTMoutput3.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating --> LSTMoutput3.Wcidc = DiagTimes (LSTMoutput3.Wci, LSTMoutput3.dc) : [1024], [1024 x 1 x *] -> [1024 x 1 x *]
Validating network. 15 nodes to process in pass 3.
Validating network, final pass.
29 out of 113 nodes do not share the minibatch layout with the input data.
Post-processing network complete.
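For readers unfamiliar with this NDL network, the validation lines above spell out a peephole LSTM with a recurrent projection layer (LSTMP). Per layer, wxxpbpwhh is the stacked 4096-dimensional gate pre-activation (cut by the Slice nodes into the four 1024-dimensional blocks G1..G4), the Wci/Wcf/Wco vectors are diagonal peephole weights applied through DiagTimes, dh and dc are the PastValue (t-1) copies of the projected output and the cell state, and Wmr projects the 1024-dimensional cell output down to 512. Reconstructed from the node list (with \odot for ElementTimes):

\[
\begin{aligned}
z_t &= W_x x_t + b + W_h y_{t-1} && \text{(4096-dim; sliced into } z^{(1)}_t,\dots,z^{(4)}_t\text{, 1024 each)} \\
i_t &= \sigma\bigl(z^{(1)}_t + W_{ci} \odot c_{t-1}\bigr), \qquad
f_t = \sigma\bigl(z^{(3)}_t + W_{cf} \odot c_{t-1}\bigr) \\
c_t &= f_t \odot c_{t-1} + i_t \odot \tanh\bigl(z^{(2)}_t\bigr) \\
o_t &= \sigma\bigl(z^{(4)}_t + W_{co} \odot c_t\bigr) \\
m_t &= o_t \odot \tanh(c_t), \qquad y_t = W_{mr}\, m_t
\end{aligned}
\]

This matches the inferred shapes: W_x is [4096 x 363] for the first layer (the input is the 363-dimensional normalized feature) and [4096 x 512 x 1] afterwards, since each subsequent layer consumes the 512-dimensional projected output y_t.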
08/16/2016 03:20:44: Created model with 113 nodes on GPU 0.
08/16/2016 03:20:44: Training criterion node(s):
08/16/2016 03:20:44: ce = CrossEntropyWithSoftmax
08/16/2016 03:20:44: Evaluation criterion node(s):
08/16/2016 03:20:44: err = ErrorPrediction
Allocating matrices for forward and/or backward propagation.
Memory Sharing: Out of 217 matrices, 125 are shared as 56, and 92 are not shared.
{ LSTMoutput2.mt : [1024 x 1 x *] (gradient)
LSTMoutput3.dh : [512 x 1 x *]
LSTMoutput3.wxx : [4096 x *] (gradient) }
{ LSTMoutput2.Wco : [1024] (gradient)
LSTMoutput3.dc : [1024 x 1 x *] }
{ LSTMoutput1.dh : [512 x 1 x *]
LSTMoutput1.wxx : [4096 x *] (gradient) }
{ LSTMoutput1.mt : [1024 x 1 x *] (gradient)
LSTMoutput2.dh : [512 x 1 x *]
LSTMoutput2.wxx : [4096 x *] (gradient) }
{ LSTMoutput1.Wco : [1024] (gradient)
LSTMoutput2.dc : [1024 x 1 x *] }
{ LSTMoutput3.b : [4096 x 1] (gradient)
LSTMoutput3.dh : [512 x 1 x *] (gradient) }
{ LSTMoutput1.bft : [1024 x 1 x *] (gradient)
LSTMoutput2.dc : [1024 x 1 x *] (gradient)
LSTMoutput3.G1 : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.G2 : [1024 x 1 x *] (gradient)
LSTMoutput2.Wcfdc : [1024 x 1 x *] (gradient)
LSTMoutput3.Wcidc : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.unnamed163 : [1024 x 1 x *] (gradient)
LSTMoutput2.unnamed175 : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.Wh : [4096 x 512] (gradient)
LSTMoutput2.G2 : [1024 x 1 x *] (gradient)
LSTMoutput3.Wcfdc : [1024 x 1 x *] (gradient) }
{ LSTMoutput2.bft : [1024 x 1 x *] (gradient)
LSTMoutput3.dc : [1024 x 1 x *] (gradient) }
{ LSTMoutput2.unnamed173 : [1024 x 1 x *] (gradient)
LSTMoutput3.unnamed185 : [1024 x 1 x *] (gradient) }
{ LSTMoutput2.unnamed177 : [1024 x 1 x *] (gradient)
LSTMoutput3.whh : [4096 x 1 x *] (gradient) }
{ LSTMoutput1.Wcidc : [1024 x 1 x *] (gradient)
LSTMoutput2.ft : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.Wcfdc : [1024 x 1 x *] (gradient)
LSTMoutput2.Wcidc : [1024 x 1 x *] (gradient)
LSTMoutput3.ft : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.wxxpb : [4096 x 1 x *] (gradient)
LSTMoutput2.it : [1024 x 1 x *] (gradient) }
{ LSTMoutput2.output : [512 x 1 x *] (gradient)
LSTMoutput3.wxxpb : [4096 x 1 x *] (gradient) }
{ LSTMoutput2.wx : [4096 x 512 x 1] (gradient)
LSTMoutput2.wxxpb : [4096 x 1 x *] }
{ LSTMoutput1.ct : [1024 x 1 x *] (gradient)
LSTMoutput2.wxxpbpwhh : [4096 x 1 x *] }
{ LSTMoutput1.unnamed164 : [1024 x 1 x *] (gradient)
LSTMoutput2.Wcfdc : [1024 x 1 x *] }
{ LSTMoutput1.G1 : [1024 x 1 x *] (gradient)
LSTMoutput2.ft : [1024 x 1 x *] }
{ LSTMoutput1.Wci : [1024] (gradient)
LSTMoutput2.G1 : [1024 x 1 x *] }
{ LSTMoutput1.Wcf : [1024] (gradient)
LSTMoutput2.it : [1024 x 1 x *] }
{ LSTMoutput1.ot : [1024 x 1 x *] (gradient)
LSTMoutput2.whh : [4096 x 1 x *] }
{ LSTMoutput1.G4 : [1024 x 1 x *] (gradient)
LSTMoutput2.G4 : [1024 x 1 x *] }
{ LSTMoutput1.Wmr : [512 x 1024] (gradient)
LSTMoutput2.wxx : [4096 x *] }
{ LSTMoutput1.G3 : [1024 x 1 x *] (gradient)
LSTMoutput2.Wcidc : [1024 x 1 x *] }
{ LSTMoutput1.whh : [4096 x 1 x *] (gradient)
LSTMoutput2.G2 : [1024 x 1 x *] }
{ LSTMoutput1.b : [4096 x 1] (gradient)
LSTMoutput1.dh : [512 x 1 x *] (gradient)
LSTMoutput2.unnamed174 : [1024 x 1 x *] }
{ LSTMoutput2.Wmr : [512 x 1024] (gradient)
LSTMoutput3.wxx : [4096 x *] }
{ LSTMoutput1.wxxpbpwhh : [4096 x 1 x *] (gradient)
LSTMoutput2.unnamed175 : [1024 x 1 x *] }
{ LSTMoutput1.wx : [4096 x 363] (gradient)
LSTMoutput1.wxxpb : [4096 x 1 x *] }
{ LSTMoutput2.unnamed174 : [1024 x 1 x *] (gradient)
LSTMoutput3.Wcfdc : [1024 x 1 x *] }
{ LSTMoutput2.G3 : [1024 x 1 x *] (gradient)
LSTMoutput3.Wcidc : [1024 x 1 x *] }
{ LSTMoutput2.Wcoct : [1024 x 1 x *] (gradient)
LSTMoutput3.G4 : [1024 x 1 x *] (gradient) }
{ LSTMoutput2.b : [4096 x 1] (gradient)
LSTMoutput2.dh : [512 x 1 x *] (gradient)
LSTMoutput3.unnamed184 : [1024 x 1 x *] }
{ LSTMoutput3.output : [512 x 1 x *] (gradient)
LSTMoutputW : [132 x 1 x *] (gradient) }
{ LSTMoutput1.ft : [1024 x 1 x *] (gradient)
LSTMoutput3.bft : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.output : [512 x 1 x *] (gradient)
LSTMoutput2.wxxpb : [4096 x 1 x *] (gradient)
LSTMoutput3.it : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.unnamed167 : [1024 x 1 x *] (gradient)
LSTMoutput2.whh : [4096 x 1 x *] (gradient)
LSTMoutput3.G2 : [1024 x 1 x *] }
{ LSTMoutput1.unnamed166 : [1024 x 1 x *] (gradient)
LSTMoutput2.wxxpbpwhh : [4096 x 1 x *] (gradient)
LSTMoutput3.unnamed185 : [1024 x 1 x *] }
{ LSTMoutput2.unnamed176 : [1024 x 1 x *] (gradient)
LSTMoutput3.wxxpbpwhh : [4096 x 1 x *] (gradient) }
{ LSTMoutput3.wx : [4096 x 512 x 1] (gradient)
LSTMoutput3.wxxpb : [4096 x 1 x *] }
{ LSTMoutput2.ct : [1024 x 1 x *] (gradient)
LSTMoutput3.wxxpbpwhh : [4096 x 1 x *] }
{ LSTMoutput2.ot : [1024 x 1 x *] (gradient)
LSTMoutput3.whh : [4096 x 1 x *] }
{ LSTMoutput3.mt : [1024 x 1 x *] (gradient)
unnamed193 : [132 x *] (gradient) }
{ LSTMoutput2.Wh : [4096 x 512] (gradient)
LSTMoutput3.G2 : [1024 x 1 x *] (gradient) }
{ LSTMoutput1.bit : [1024 x 1 x *] (gradient)
LSTMoutput3.unnamed183 : [1024 x 1 x *] (gradient) }
{ LSTMoutput3.Wmr : [512 x 1024] (gradient)
unnamed193 : [132 x *] }
{ LSTMoutput1.unnamed165 : [1024 x 1 x *] (gradient)
LSTMoutput3.bft : [1024 x 1 x *] }
{ LSTMoutputW : [132 x 1 x *]
W : [132 x 512 x 1] (gradient) }
{ LSTMoutput2.Wci : [1024] (gradient)
LSTMoutput3.G1 : [1024 x 1 x *] }
{ LSTMoutput1.dc : [1024 x 1 x *] (gradient)
LSTMoutput2.G1 : [1024 x 1 x *] (gradient)
LSTMoutput3.ft : [1024 x 1 x *] }
{ LSTMoutput2.Wcf : [1024] (gradient)
LSTMoutput3.it : [1024 x 1 x *] }
{ LSTMoutput1.it : [1024 x 1 x *] (gradient)
LSTMoutput3.unnamed183 : [1024 x 1 x *] }
{ LSTMoutput1.Wcoct : [1024 x 1 x *] (gradient)
LSTMoutput2.G4 : [1024 x 1 x *] (gradient)
LSTMoutput3.G4 : [1024 x 1 x *] }
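Each brace-delimited group above is one physical buffer reused by every matrix listed inside it, which is how the header's tally arises: 125 matrices shared as 56 buffers, plus 92 unshared, for 217 total. The tally can be cross-checked mechanically; a small sketch, assuming the group listing has been captured to a file (the filename is hypothetical):

    import re

    # Each "{ ... }" block is one shared buffer; every member appears as
    # "name : [shape]", so counting ": [" occurrences counts members.
    text = open("memshare_groups.txt").read()    # hypothetical capture of the listing
    groups = re.findall(r"\{(.*?)\}", text, flags=re.S)
    members = sum(len(re.findall(r":\s*\[", g)) for g in groups)
    print(len(groups), members)                  # expected: 56 125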
08/16/2016 03:20:44: Training 13634692 parameters in 23 out of 23 parameter tensors and 104 nodes with gradient:
08/16/2016 03:20:44: Node 'LSTMoutput1.Wcf' (LearnableParameter operation) : [1024]
08/16/2016 03:20:44: Node 'LSTMoutput1.Wci' (LearnableParameter operation) : [1024]
08/16/2016 03:20:44: Node 'LSTMoutput1.Wco' (LearnableParameter operation) : [1024]
08/16/2016 03:20:44: Node 'LSTMoutput1.Wh' (LearnableParameter operation) : [4096 x 512]
08/16/2016 03:20:44: Node 'LSTMoutput1.Wmr' (LearnableParameter operation) : [512 x 1024]
08/16/2016 03:20:44: Node 'LSTMoutput1.b' (LearnableParameter operation) : [4096 x 1]
08/16/2016 03:20:44: Node 'LSTMoutput1.wx' (LearnableParameter operation) : [4096 x 363]
08/16/2016 03:20:44: Node 'LSTMoutput2.Wcf' (LearnableParameter operation) : [1024]
08/16/2016 03:20:44: Node 'LSTMoutput2.Wci' (LearnableParameter operation) : [1024]
08/16/2016 03:20:44: Node 'LSTMoutput2.Wco' (LearnableParameter operation) : [1024]
08/16/2016 03:20:44: Node 'LSTMoutput2.Wh' (LearnableParameter operation) : [4096 x 512]
08/16/2016 03:20:44: Node 'LSTMoutput2.Wmr' (LearnableParameter operation) : [512 x 1024]
08/16/2016 03:20:44: Node 'LSTMoutput2.b' (LearnableParameter operation) : [4096 x 1]
08/16/2016 03:20:44: Node 'LSTMoutput2.wx' (LearnableParameter operation) : [4096 x 512 x 1]
08/16/2016 03:20:44: Node 'LSTMoutput3.Wcf' (LearnableParameter operation) : [1024]
08/16/2016 03:20:44: Node 'LSTMoutput3.Wci' (LearnableParameter operation) : [1024]
08/16/2016 03:20:44: Node 'LSTMoutput3.Wco' (LearnableParameter operation) : [1024]
08/16/2016 03:20:44: Node 'LSTMoutput3.Wh' (LearnableParameter operation) : [4096 x 512]
08/16/2016 03:20:44: Node 'LSTMoutput3.Wmr' (LearnableParameter operation) : [512 x 1024]
08/16/2016 03:20:44: Node 'LSTMoutput3.b' (LearnableParameter operation) : [4096 x 1]
08/16/2016 03:20:44: Node 'LSTMoutput3.wx' (LearnableParameter operation) : [4096 x 512 x 1]
08/16/2016 03:20:44: Node 'W' (LearnableParameter operation) : [132 x 512 x 1]
08/16/2016 03:20:44: Node 'b' (LearnableParameter operation) : [132 x 1]
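The 13634692 figure is simply the element count summed over the 23 tensors just listed; a quick arithmetic check in Python (shapes copied from the log):

    from functools import reduce
    from operator import mul

    shapes = (
        # per LSTM layer (x3): Wcf, Wci, Wco, Wh, Wmr, b
        [[1024], [1024], [1024], [4096, 512], [512, 1024], [4096, 1]] * 3
        # wx of layers 1..3, then the output W and b
        + [[4096, 363], [4096, 512, 1], [4096, 512, 1], [132, 512, 1], [132, 1]]
    )
    assert len(shapes) == 23
    print(sum(reduce(mul, s) for s in shapes))   # 13634692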
08/16/2016 03:20:44: Precomputing --> 3 PreCompute nodes found.
08/16/2016 03:20:44: featNorm.xMean = Mean()
08/16/2016 03:20:44: featNorm.xStdDev = InvStdDev()
08/16/2016 03:20:44: logPrior.prior = Mean()
minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
08/16/2016 03:20:45: Precomputing --> Completed.
08/16/2016 03:20:46: Starting Epoch 1: learning rate per sample = 0.001953 effective momentum = 0.000000 momentum as time constant = 0.0 samples
minibatchiterator: epoch 0: frames [0..64] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
08/16/2016 03:20:46: Starting minibatch loop.
08/16/2016 03:20:47: Epoch[ 1 of 1]-Minibatch[ 1- 10, 250.00%]: ce = 4.87453079 * 160; err = 0.90625000 * 160; time = 1.1338s; samplesPerSecond = 141.1
08/16/2016 03:20:48: Epoch[ 1 of 1]-Minibatch[ 11- 20, 500.00%]: ce = 4.84628143 * 160; err = 0.69375000 * 160; time = 1.0409s; samplesPerSecond = 153.7
08/16/2016 03:20:49: Finished Epoch[ 1 of 1]: [Training] ce = 4.85708837 * 418; err = 0.80382775 * 418; totalSamplesSeen = 418; learningRatePerSample = 0.001953125; epochTime=2.90303s
08/16/2016 03:20:50: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160816031849.416502\Examples\Speech\AN4_LSTM@release_gpu/Models/cntkSpeechLSTM.dnn'
08/16/2016 03:20:51: CNTKCommandTrainEnd: speechTrain
08/16/2016 03:20:51: Action "train" complete.
08/16/2016 03:20:51: __COMPLETED__
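Two numbers in the summary lines above can be reproduced from the configuration. The per-sample learning rate 0.001953125 equals learningRatesPerMB / (16 x 16) = 0.5 / 256, consistent with a truncated-BPTT minibatch of 16 parallel utterances times 16 frames; reading the divisor this way is an inference from the numbers in this log, not a statement about CNTK internals. The criterion figures are sample-weighted averages, so the epoch line can be checked against the per-minibatch lines:

    # Per-sample learning rate (0.5 per MB over 16 utterances x 16 frames,
    # inferred from minibatchSize and nbrUttsIneachRecurrentIter above).
    print(0.5 / (16 * 16))                    # 0.001953125

    # "ce = 4.87453079 * 160" means an average of 4.87453079 over 160 frames;
    # the epoch summary aggregates all 418 frames the same way.
    reported = [(4.87453079, 160), (4.84628143, 160)]
    total = 4.85708837 * 418
    rest = total - sum(ce * n for ce, n in reported)
    print(rest / (418 - 320))                 # ~4.846: implied ce of the last 98 frames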


@@ -5,5 +5,5 @@
 ConfigDir=$TEST_DIR/../../../../../../Examples/Speech/AN4/Config
 # cntkrun <CNTK config file name> <additional CNTK args>
-cntkrun LSTM-NDL.cntk "speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=64]] parallelTrain=false" || exit $?
+cntkrun LSTM-NDL.cntk "speechTrain=[SGD=[maxEpochs=1]] speechTrain=[SGD=[epochSize=64]] speechTrain=[reader=[useMersenneTwisterRand=true]] parallelTrain=false" || exit $?

[Diff not shown because of its large size.]


@@ -6,4 +6,4 @@
 . $TEST_DIR/../run-timit-test-common
 # cntkrun <CNTK config file name> <additional CNTK arg>
-cntkrun TIMIT_AdaptLearnRate.cntk "$CntkArguments" || exit $?
+cntkrun TIMIT_AdaptLearnRate.cntk "$CntkArguments TIMIT_TrainAdaptLR=[reader=[useMersenneTwisterRand=true]] TIMIT_TrainAdaptLR=[cvReader=[useMersenneTwisterRand=true]]" || exit $?

[Diff not shown because of its large size.]


@@ -6,7 +6,7 @@
 . $TEST_DIR/../run-timit-test-common
 # Train:
-cntkrun TIMIT_TrainSimpleNetwork.cntk "$CntkArguments" || exit $?
+cntkrun TIMIT_TrainSimpleNetwork.cntk "$CntkArguments TIMIT_TrainSimple=[reader=[useMersenneTwisterRand=true]]" || exit $?
 # Validate:
 cntkrun TIMIT_CrossValidateSimpleNetwork.cntk "$CntkArguments" || exit $?

[Some files were not shown because too many files changed in this diff.]