From 3343afb09273185b6621a0bce00c926b9f5b945a Mon Sep 17 00:00:00 2001
From: Amit <amitaga@microsoft.com>
Date: Sat, 24 Oct 2015 14:31:34 -0700
Subject: [PATCH] Remove debug info generation option for CUDA compilation in
 debug flavor builds and also enable fast-math optimizations. These changes
 have been done to eliminate differences in GPU results for the E2E tests
 between debug and release flavors. Setting environment variable
 CNTK_CUDA_DEVICE_DEBUGINFO=1 will enable debug info generation. The baselines
 for all E2E tests have also been updated in accordance with this change

---
 Makefile                                      |    6 +-
 Math/Math/CNTKMathCUDA.vcxproj                |    7 +-
 .../SinglePrecision/baseline.gpu.txt          | 2608 ++++----
 .../SinglePrecision/baseline.windows.gpu.txt  | 2449 ++++++++
 .../baseline.gpu.txt                          | 1144 +++-
 .../baseline.windows.gpu.txt                  | 1146 +++-
 .../DiscriminativePreTraining/testcases.yml   |    6 +-
 .../ParallelNoQuantization/baseline.gpu.txt   |  924 ++-
 .../baseline.windows.gpu.txt                  |  973 ++-
 .../LSTM/FullUtterance/baseline.gpu.txt       | 5469 +++++++++-------
 .../FullUtterance/baseline.windows.gpu.txt    | 5494 ++++++++++-------
 Tests/Speech/LSTM/Truncated/baseline.gpu.txt  | 5128 +++++++++------
 .../LSTM/Truncated/baseline.windows.gpu.txt   | 5157 ++++++++++------
 Tests/Speech/QuickE2E/baseline.gpu.txt        | 1238 +++-
 .../Speech/QuickE2E/baseline.windows.gpu.txt  | 1214 +++-
 15 files changed, 22337 insertions(+), 10626 deletions(-)
 create mode 100644 Tests/ParallelTraining/NoQuantization/SinglePrecision/baseline.windows.gpu.txt
diff --git a/Makefile b/Makefile
index 62147fd9c..6f88951b5 100644
--- a/Makefile
+++ b/Makefile
@@ -151,7 +151,7 @@ ifeq ("$(BUILDTYPE)","debug")
 
   CXXFLAGS += -g
   CPPFLAGS += -D_DEBUG
-  CUFLAGS += -O0 -G -lineinfo  $(GENCODE_FLAGS)
+  CUFLAGS += -O0 -use_fast_math -lineinfo  $(GENCODE_FLAGS)
 endif
 
 ifeq ("$(BUILDTYPE)","release")
@@ -165,6 +165,10 @@ ifeq ("$(BUILDTYPE)","release")
   CUFLAGS += -O3 -use_fast_math -lineinfo $(GENCODE_FLAGS)
 endif
 
+ifdef CNTK_CUDA_DEVICE_DEBUGINFO
+  CUFLAGS += -G
+endif
+
 #######
 
 OBJDIR:= $(BUILD_TOP)/.build
diff --git a/Math/Math/CNTKMathCUDA.vcxproj b/Math/Math/CNTKMathCUDA.vcxproj
index d1deb47b0..283caa2cf 100644
--- a/Math/Math/CNTKMathCUDA.vcxproj
+++ b/Math/Math/CNTKMathCUDA.vcxproj
@@ -85,6 +85,9 @@
       <GenerateLineInfo>true</GenerateLineInfo>
       <AdditionalCompilerOptions>/WX</AdditionalCompilerOptions>
       <CodeGeneration>$(CudaCodeGen)</CodeGeneration>
+      <FastMath>true</FastMath>
+      <GPUDebugInfo>false</GPUDebugInfo>
+      <GPUDebugInfo Condition="'$(CNTK_CUDA_DEVICE_DEBUGINFO)'=='1'">true</GPUDebugInfo>
     </CudaCompile>
     <PostBuildEvent>
       <Command>xcopy /D /I /Y "$(CudaPath)\bin\cudart64_*.dll" $(OutputPath)</Command>
@@ -114,8 +117,6 @@
       <OptimizeReferences>true</OptimizeReferences>
     </Link>
     <CudaCompile>
-      <FastMath>true</FastMath>
-      <GPUDebugInfo>false</GPUDebugInfo>
       <HostDebugInfo>false</HostDebugInfo>
     </CudaCompile>
   </ItemDefinitionGroup>
@@ -191,4 +192,4 @@
     <Error Condition="!Exists('$(CUB_PATH)')"
       Text="CNTK requires NVIDIA CUB library v1.4.1 to build. Please download the library from https://nvlabs.github.io/cub/ and set CUB_PATH environment variable to CUB root path (e.g. c:\src\cub-1.4.1)." />
   </Target>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/Tests/ParallelTraining/NoQuantization/SinglePrecision/baseline.gpu.txt b/Tests/ParallelTraining/NoQuantization/SinglePrecision/baseline.gpu.txt
index 1c7050785..294329581 100644
--- a/Tests/ParallelTraining/NoQuantization/SinglePrecision/baseline.gpu.txt
+++ b/Tests/ParallelTraining/NoQuantization/SinglePrecision/baseline.gpu.txt
@@ -1,4 +1,4 @@
-=== Running C:\Program Files\Microsoft MPI\Bin\/mpiexec.exe -n 4 E:\NetScale\CNTK\git_repos\public_master\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\SimpleMultiGPU.config RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
+=== Running mpiexec -n 4 /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../../SimpleMultiGPU.config RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../.. DeviceId=0 precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr
 MPIWrapper: initializing MPI
 MPIWrapper: initializing MPI
 MPIWrapper: initializing MPI
@@ -8,46 +8,40 @@ ping [requestnodes (before change)]: 4 nodes pinging each other
 ping [requestnodes (before change)]: 4 nodes pinging each other
 ping [requestnodes (before change)]: 4 nodes pinging each other
 ping [requestnodes (before change)]: all 4 nodes responded
-ping [requestnodes (before change)]: all 4 nodes responded
-ping [requestnodes (before change)]: all 4 nodes responded
 requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (2) are in (participating)
+ping [requestnodes (after change)]: 4 nodes pinging each other
+ping [requestnodes (before change)]: all 4 nodes responded
+requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (1) are in (participating)
+ping [requestnodes (after change)]: 4 nodes pinging each other
+ping [requestnodes (before change)]: all 4 nodes responded
 requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (0) are in (participating)
+ping [requestnodes (after change)]: 4 nodes pinging each other
 ping [requestnodes (before change)]: all 4 nodes responded
 requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (3) are in (participating)
 ping [requestnodes (after change)]: 4 nodes pinging each other
-ping [requestnodes (after change)]: 4 nodes pinging each other
-requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (1) are in (participating)
-ping [requestnodes (after change)]: 4 nodes pinging each other
-ping [requestnodes (after change)]: 4 nodes pinging each other
-ping [requestnodes (after change)]: all 4 nodes responded
-ping [requestnodes (after change)]: all 4 nodes responded
-ping [requestnodes (after change)]: all 4 nodes responded
-mpihelper: we are cog 1 in a gearbox of 4
 ping [requestnodes (after change)]: all 4 nodes responded
 mpihelper: we are cog 3 in a gearbox of 4
-mpihelper: we are cog 0 in a gearbox of 4
 ping [mpihelper]: 4 nodes pinging each other
+ping [mpihelper]: all 4 nodes responded
+ping [requestnodes (after change)]: all 4 nodes responded
+mpihelper: we are cog 1 in a gearbox of 4
+ping [mpihelper]: 4 nodes pinging each other
+ping [mpihelper]: all 4 nodes responded
+ping [requestnodes (after change)]: all 4 nodes responded
 mpihelper: we are cog 2 in a gearbox of 4
 ping [mpihelper]: 4 nodes pinging each other
-ping [mpihelper]: 4 nodes pinging each other
+ping [mpihelper]: all 4 nodes responded
+ping [requestnodes (after change)]: all 4 nodes responded
+mpihelper: we are cog 0 in a gearbox of 4
 ping [mpihelper]: 4 nodes pinging each other
 ping [mpihelper]: all 4 nodes responded
-ping [mpihelper]: all 4 nodes responded
-ping [mpihelper]: all 4 nodes responded
-ping [mpihelper]: all 4 nodes responded
-MPI Rank 0: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank0
-MPI Rank 0: -------------------------------------------------------------------
-MPI Rank 0: Build info: 
-MPI Rank 0: 
-MPI Rank 0: 		Built time: Aug 25 2015 17:44:46
-MPI Rank 0: 		Last modified date: Mon Aug 24 16:38:42 2015
-MPI Rank 0: 		Built by amitaga on Amitaga-Win-DT3           
-MPI Rank 0: 		Build Path: E:\NetScale\CNTK\git_repos\public_master\MachineLearning\CNTK\
-MPI Rank 0: 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
-MPI Rank 0: -------------------------------------------------------------------
-MPI Rank 0: running on Amitaga-Win-DT3 at 2015/08/26 01:48:43
-MPI Rank 0: command line options: 
-MPI Rank 0: configFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\SimpleMultiGPU.config RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] 
+Redirecting stderr to file /tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank0
+Redirecting stderr to file /tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank1
+Redirecting stderr to file /tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank2
+Redirecting stderr to file /tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank3
+MPI Rank 0: running on localhost at 2015/10/24 12:44:53
+MPI Rank 0: command line: 
+MPI Rank 0: /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../../SimpleMultiGPU.config RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../.. DeviceId=0 precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr 
 MPI Rank 0: 
 MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
 MPI Rank 0: deviceId=$DeviceId$
@@ -75,12 +69,11 @@ MPI Rank 0:         minibatchSize=25
 MPI Rank 0:         learningRatesPerMB=0.5:0.2*20:0.1
 MPI Rank 0:         momentumPerMB=0.9
 MPI Rank 0:         dropoutRate=0.0
-MPI Rank 0:         maxEpochs=10
+MPI Rank 0:         maxEpochs=4
 MPI Rank 0:         ParallelTrain=[
 MPI Rank 0:             parallelizationMethod=DataParallelSGD
 MPI Rank 0:             DataParallelSGD=[
 MPI Rank 0:               gradientBits=1
-MPI Rank 0:               parallelizationStartEpoch=1
 MPI Rank 0:             ]
 MPI Rank 0:         ]
 MPI Rank 0:     ]
@@ -102,12 +95,13 @@ MPI Rank 0:         labelMappingFile=$DataDir$/SimpleMapping.txt
 MPI Rank 0:       ]
 MPI Rank 0:     ]
 MPI Rank 0: ]
-MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
-MPI Rank 0: DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data
+MPI Rank 0: RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 0: DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data
+MPI Rank 0: ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../..
 MPI Rank 0: DeviceId=0
-MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
 MPI Rank 0: precision=float
 MPI Rank 0: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
+MPI Rank 0: stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr
 MPI Rank 0: 
 MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
 MPI Rank 0: 
@@ -118,7 +112,7 @@ MPI Rank 0: precision=float
 MPI Rank 0: parallelTrain=true
 MPI Rank 0: SimpleMultiGPU=[
 MPI Rank 0:     action=train
-MPI Rank 0:     modelPath=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 0:     modelPath=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
 MPI Rank 0:     deviceId=0
 MPI Rank 0:     traceLevel=1
 MPI Rank 0:     SimpleNetworkBuilder=[
@@ -137,18 +131,17 @@ MPI Rank 0:         minibatchSize=25
 MPI Rank 0:         learningRatesPerMB=0.5:0.2*20:0.1
 MPI Rank 0:         momentumPerMB=0.9
 MPI Rank 0:         dropoutRate=0.0
-MPI Rank 0:         maxEpochs=10
+MPI Rank 0:         maxEpochs=4
 MPI Rank 0:         ParallelTrain=[
 MPI Rank 0:             parallelizationMethod=DataParallelSGD
 MPI Rank 0:             DataParallelSGD=[
 MPI Rank 0:               gradientBits=1
-MPI Rank 0:               parallelizationStartEpoch=1
 MPI Rank 0:             ]
 MPI Rank 0:         ]
 MPI Rank 0:     ]
 MPI Rank 0:     reader=[
 MPI Rank 0:       readerType=UCIFastReader
-MPI Rank 0:       file=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 0:       file=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt
 MPI Rank 0:       miniBatchMode=Partial
 MPI Rank 0:       randomize=None
 MPI Rank 0:       verbosity=1   
@@ -160,29 +153,31 @@ MPI Rank 0:       labels=[
 MPI Rank 0: start=2      
 MPI Rank 0: dim=1        
 MPI Rank 0: labelDim=2   
-MPI Rank 0:         labelMappingFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleMapping.txt
+MPI Rank 0:         labelMappingFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleMapping.txt
 MPI Rank 0:       ]
 MPI Rank 0:     ]
 MPI Rank 0: ]
-MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
-MPI Rank 0: DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data
+MPI Rank 0: RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 0: DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data
+MPI Rank 0: ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../..
 MPI Rank 0: DeviceId=0
-MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
 MPI Rank 0: precision=float
 MPI Rank 0: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
+MPI Rank 0: stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr
 MPI Rank 0: 
 MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 MPI Rank 0: 
 MPI Rank 0: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 MPI Rank 0: configparameters: SimpleMultiGPU.config:command=SimpleMultiGPU
-MPI Rank 0: configparameters: SimpleMultiGPU.config:DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data
+MPI Rank 0: configparameters: SimpleMultiGPU.config:ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../..
+MPI Rank 0: configparameters: SimpleMultiGPU.config:DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data
 MPI Rank 0: configparameters: SimpleMultiGPU.config:deviceId=0
 MPI Rank 0: configparameters: SimpleMultiGPU.config:parallelTrain=true
 MPI Rank 0: configparameters: SimpleMultiGPU.config:precision=float
-MPI Rank 0: configparameters: SimpleMultiGPU.config:RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 0: configparameters: SimpleMultiGPU.config:RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu
 MPI Rank 0: configparameters: SimpleMultiGPU.config:SimpleMultiGPU=[
 MPI Rank 0:     action=train
-MPI Rank 0:     modelPath=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 0:     modelPath=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
 MPI Rank 0:     deviceId=0
 MPI Rank 0:     traceLevel=1
 MPI Rank 0:     SimpleNetworkBuilder=[
@@ -201,18 +196,17 @@ MPI Rank 0:         minibatchSize=25
 MPI Rank 0:         learningRatesPerMB=0.5:0.2*20:0.1
 MPI Rank 0:         momentumPerMB=0.9
 MPI Rank 0:         dropoutRate=0.0
-MPI Rank 0:         maxEpochs=10
+MPI Rank 0:         maxEpochs=4
 MPI Rank 0:         ParallelTrain=[
 MPI Rank 0:             parallelizationMethod=DataParallelSGD
 MPI Rank 0:             DataParallelSGD=[
 MPI Rank 0:               gradientBits=1
-MPI Rank 0:               parallelizationStartEpoch=1
 MPI Rank 0:             ]
 MPI Rank 0:         ]
 MPI Rank 0:     ]
 MPI Rank 0:     reader=[
 MPI Rank 0:       readerType=UCIFastReader
-MPI Rank 0:       file=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 0:       file=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt
 MPI Rank 0:       miniBatchMode=Partial
 MPI Rank 0:       randomize=None
 MPI Rank 0:       verbosity=1   
@@ -224,45 +218,100 @@ MPI Rank 0:       labels=[
 MPI Rank 0: start=2      
 MPI Rank 0: dim=1        
 MPI Rank 0: labelDim=2   
-MPI Rank 0:         labelMappingFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleMapping.txt
+MPI Rank 0:         labelMappingFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleMapping.txt
 MPI Rank 0:       ]
 MPI Rank 0:     ]
 MPI Rank 0: ] [SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
 MPI Rank 0: 
-MPI Rank 0: configparameters: SimpleMultiGPU.config:stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
+MPI Rank 0: configparameters: SimpleMultiGPU.config:stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr
 MPI Rank 0: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 MPI Rank 0: command: SimpleMultiGPU 
 MPI Rank 0: precision = float
+MPI Rank 0: CNTKModelPath: /tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 0: CNTKCommandTrainInfo: SimpleMultiGPU : 4
+MPI Rank 0: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 4
+MPI Rank 0: CNTKCommandTrainBegin: SimpleMultiGPU
 MPI Rank 0: SimpleNetworkBuilder Using GPU 0
-MPI Rank 0: reading uci file E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 0: reading uci file /home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt
+MPI Rank 0: SetUniformRandomValue (GPU): creating curand object with seed 1
 MPI Rank 0: GetTrainCriterionNodes  ...
 MPI Rank 0: GetEvalCriterionNodes  ...
 MPI Rank 0: 
 MPI Rank 0: 
-MPI Rank 0: Validating node CrossEntropyWithSoftmax 
+MPI Rank 0: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1.
 MPI Rank 0: 
-MPI Rank 0: Validating --> labels = InputValue
-MPI Rank 0: Validating --> W2 = LearnableParameter
-MPI Rank 0: Validating --> W1 = LearnableParameter
-MPI Rank 0: Validating --> W0 = LearnableParameter
-MPI Rank 0: Validating --> features = InputValue
-MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, 3])
-MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, 3])
-MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1])
-MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, 3])
-MPI Rank 0: Validating --> B0 = LearnableParameter
-MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, 3], B0[50, 1])
-MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, 3])
-MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, 3])
-MPI Rank 0: Validating --> B1 = LearnableParameter
-MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, 3], B1[50, 1])
-MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, 3])
-MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, 3])
-MPI Rank 0: Validating --> B2 = LearnableParameter
-MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, 3], B2[2, 1])
-MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, 3], HLast[2, 3])
+MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 0: 
+MPI Rank 0: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 0: 
+MPI Rank 0: Validating for node CrossEntropyWithSoftmax, final verification.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 0: 
+MPI Rank 0: 9 out of 20 nodes do not share the minibatch layout with the input data.
+MPI Rank 0: 
+MPI Rank 0: 
+MPI Rank 0: Precomputing --> 3 PreCompute nodes found.
 MPI Rank 0: 
-MPI Rank 0: Found 3 PreCompute nodes
 MPI Rank 0: 	NodeName: InvStdOfFeatures
 MPI Rank 0: 	NodeName: MeanOfFeatures
 MPI Rank 0: 	NodeName: Prior
@@ -273,250 +322,320 @@ MPI Rank 0: starting epoch 0 at record count 0, and file position 0
 MPI Rank 0: already there from last epoch
 MPI Rank 0: 
 MPI Rank 0: 
-MPI Rank 0: Validating node InvStdOfFeatures 
+MPI Rank 0: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1.
 MPI Rank 0: 
-MPI Rank 0: Validating --> features = InputValue
-MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, 25])
+MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: 
+MPI Rank 0: Validating for node InvStdOfFeatures, final verification.
+MPI Rank 0: 
+MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: 
+MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data.
 MPI Rank 0: 
 MPI Rank 0: 
 MPI Rank 0: 
-MPI Rank 0: Validating node MeanOfFeatures 
+MPI Rank 0: Validating for node MeanOfFeatures. 2 nodes to process in pass 1.
 MPI Rank 0: 
-MPI Rank 0: Validating --> features = InputValue
-MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, 25])
+MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: 
+MPI Rank 0: Validating for node MeanOfFeatures, final verification.
+MPI Rank 0: 
+MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: 
+MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data.
 MPI Rank 0: 
 MPI Rank 0: 
 MPI Rank 0: 
-MPI Rank 0: Validating node Prior 
+MPI Rank 0: Validating for node Prior. 2 nodes to process in pass 1.
 MPI Rank 0: 
-MPI Rank 0: Validating --> labels = InputValue
-MPI Rank 0: Validating --> Prior = Mean(labels[2, 25])
+MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: 
+MPI Rank 0: Validating for node Prior. 1 nodes to process in pass 2.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: 
+MPI Rank 0: Validating for node Prior, final verification.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: 
+MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data.
+MPI Rank 0: 
+MPI Rank 0: EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once.
+MPI Rank 0: 
+MPI Rank 0: Precomputing --> Completed.
 MPI Rank 0: 
 MPI Rank 0: Set Max Temp Mem Size For Convolution Nodes to 0 samples.
-MPI Rank 0: Starting Epoch 1: learning rate per sample = 0.020000  momentum = 0.900001 
+MPI Rank 0: Starting Epoch 1: learning rate per sample = 0.020000  effective momentum = 0.900000 
 MPI Rank 0: starting epoch 0 at record count 0, and file position 0
 MPI Rank 0: already there from last epoch
 MPI Rank 0: 
+MPI Rank 0: 
+MPI Rank 0: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 0: 
+MPI Rank 0: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 0: 
+MPI Rank 0: Validating for node EvalErrorPrediction, final verification.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 0: 
+MPI Rank 0: 9 out of 20 nodes do not share the minibatch layout with the input data.
+MPI Rank 0: 
+MPI Rank 0: 
 MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 4, NumGradientBits = 32).
-MPI Rank 0: 
-MPI Rank 0: 
-MPI Rank 0: Validating node EvalErrorPrediction 
-MPI Rank 0: 
-MPI Rank 0: Validating --> labels = InputValue
-MPI Rank 0: Validating --> W2 = LearnableParameter
-MPI Rank 0: Validating --> W1 = LearnableParameter
-MPI Rank 0: Validating --> W0 = LearnableParameter
-MPI Rank 0: Validating --> features = InputValue
-MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, 6])
-MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, 6])
-MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, 6], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1])
-MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, 6])
-MPI Rank 0: Validating --> B0 = LearnableParameter
-MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, 6], B0[50, 1])
-MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, 6])
-MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, 6])
-MPI Rank 0: Validating --> B1 = LearnableParameter
-MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, 6], B1[50, 1])
-MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, 6])
-MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, 6])
-MPI Rank 0: Validating --> B2 = LearnableParameter
-MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, 6], B2[2, 1])
-MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, 6], HLast[2, 6])
-MPI Rank 0: 
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[   1-  10 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70007980; EvalErr[0]PerSample = 0.52399999; TotalTime = 0.19902s; TotalTimePerSample = 0.79607ms; SamplesPerSecond = 1256
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[  11-  20 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71514523; EvalErr[0]PerSample = 0.51999998; TotalTime = 0.15552s; TotalTimePerSample = 0.62210ms; SamplesPerSecond = 1607
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[  21-  30 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.72945595; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.14888s; TotalTimePerSample = 0.59550ms; SamplesPerSecond = 1679
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[  31-  40 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70079005; EvalErr[0]PerSample = 0.52399999; TotalTime = 0.14477s; TotalTimePerSample = 0.57906ms; SamplesPerSecond = 1726
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[  41-  50 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70605618; EvalErr[0]PerSample = 0.54000002; TotalTime = 0.14227s; TotalTimePerSample = 0.56910ms; SamplesPerSecond = 1757
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[  51-  60 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71572435; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.13676s; TotalTimePerSample = 0.54705ms; SamplesPerSecond = 1827
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[  61-  70 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.72149903; EvalErr[0]PerSample = 0.47999999; TotalTime = 0.13631s; TotalTimePerSample = 0.54524ms; SamplesPerSecond = 1834
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[  71-  80 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.79845655; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.13451s; TotalTimePerSample = 0.53804ms; SamplesPerSecond = 1858
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[  81-  90 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69665188; EvalErr[0]PerSample = 0.46799999; TotalTime = 0.13043s; TotalTimePerSample = 0.52173ms; SamplesPerSecond = 1916
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[  91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70723337; EvalErr[0]PerSample = 0.49200001; TotalTime = 0.12788s; TotalTimePerSample = 0.51150ms; SamplesPerSecond = 1955
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71420360; EvalErr[0]PerSample = 0.55199999; TotalTime = 0.12629s; TotalTimePerSample = 0.50518ms; SamplesPerSecond = 1979
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69535255; EvalErr[0]PerSample = 0.43599999; TotalTime = 0.12558s; TotalTimePerSample = 0.50232ms; SamplesPerSecond = 1990
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70078611; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12260s; TotalTimePerSample = 0.49041ms; SamplesPerSecond = 2039
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71857810; EvalErr[0]PerSample = 0.54799998; TotalTime = 0.12292s; TotalTimePerSample = 0.49170ms; SamplesPerSecond = 2033
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.72088283; EvalErr[0]PerSample = 0.48800001; TotalTime = 0.12233s; TotalTimePerSample = 0.48931ms; SamplesPerSecond = 2043
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71798825; EvalErr[0]PerSample = 0.55199999; TotalTime = 0.12124s; TotalTimePerSample = 0.48494ms; SamplesPerSecond = 2062
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.74162209; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12328s; TotalTimePerSample = 0.49313ms; SamplesPerSecond = 2027
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71835059; EvalErr[0]PerSample = 0.51599997; TotalTime = 0.12341s; TotalTimePerSample = 0.49363ms; SamplesPerSecond = 2025
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71529394; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12333s; TotalTimePerSample = 0.49334ms; SamplesPerSecond = 2027
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71727639; EvalErr[0]PerSample = 0.53200001; TotalTime = 0.12472s; TotalTimePerSample = 0.49886ms; SamplesPerSecond = 2004
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71745312; EvalErr[0]PerSample = 0.50400001; TotalTime = 0.12361s; TotalTimePerSample = 0.49445ms; SamplesPerSecond = 2022
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.72088087; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12197s; TotalTimePerSample = 0.48789ms; SamplesPerSecond = 2049
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.72006541; EvalErr[0]PerSample = 0.50800002; TotalTime = 0.12266s; TotalTimePerSample = 0.49062ms; SamplesPerSecond = 2038
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71275192; EvalErr[0]PerSample = 0.51200002; TotalTime = 0.12162s; TotalTimePerSample = 0.48650ms; SamplesPerSecond = 2055
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69645119; EvalErr[0]PerSample = 0.50400001; TotalTime = 0.12120s; TotalTimePerSample = 0.48480ms; SamplesPerSecond = 2062
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70129883; EvalErr[0]PerSample = 0.51200002; TotalTime = 0.12090s; TotalTimePerSample = 0.48360ms; SamplesPerSecond = 2067
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70768166; EvalErr[0]PerSample = 0.54400003; TotalTime = 0.12161s; TotalTimePerSample = 0.48645ms; SamplesPerSecond = 2055
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69744140; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.12268s; TotalTimePerSample = 0.49071ms; SamplesPerSecond = 2037
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69266015; EvalErr[0]PerSample = 0.44800001; TotalTime = 0.12182s; TotalTimePerSample = 0.48726ms; SamplesPerSecond = 2052
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69347072; EvalErr[0]PerSample = 0.49599999; TotalTime = 0.12233s; TotalTimePerSample = 0.48930ms; SamplesPerSecond = 2043
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69257420; EvalErr[0]PerSample = 0.54000002; TotalTime = 0.12287s; TotalTimePerSample = 0.49149ms; SamplesPerSecond = 2034
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.68625975; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12417s; TotalTimePerSample = 0.49666ms; SamplesPerSecond = 2013
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69064063; EvalErr[0]PerSample = 0.46799999; TotalTime = 0.12340s; TotalTimePerSample = 0.49358ms; SamplesPerSecond = 2025
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70192385; EvalErr[0]PerSample = 0.46000001; TotalTime = 0.12176s; TotalTimePerSample = 0.48704ms; SamplesPerSecond = 2053
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69058985; EvalErr[0]PerSample = 0.51999998; TotalTime = 0.12238s; TotalTimePerSample = 0.48950ms; SamplesPerSecond = 2042
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.67041212; EvalErr[0]PerSample = 0.39199999; TotalTime = 0.12185s; TotalTimePerSample = 0.48742ms; SamplesPerSecond = 2051
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.65914255; EvalErr[0]PerSample = 0.35600001; TotalTime = 0.12263s; TotalTimePerSample = 0.49050ms; SamplesPerSecond = 2038
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.63919920; EvalErr[0]PerSample = 0.36399999; TotalTime = 0.12265s; TotalTimePerSample = 0.49062ms; SamplesPerSecond = 2038
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.61294138; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.12143s; TotalTimePerSample = 0.48572ms; SamplesPerSecond = 2058
-MPI Rank 0:  Epoch[ 1 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.55255663; EvalErr[0]PerSample = 0.18799999; TotalTime = 0.12034s; TotalTimePerSample = 0.48136ms; SamplesPerSecond = 2077
-MPI Rank 0: Finished Epoch[ 1 of 10]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.47350001; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.254118
-MPI Rank 0: Starting Epoch 2: learning rate per sample = 0.008000  momentum = 0.900001 
+MPI Rank 0: DecimateMinibatchSequences: WARNING: Number of parallel utterances 25 not a multiple of number of GPUs 4, GPU usage will be suboptimal.
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[   1-  10 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70007977; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.12781s; TotalTimePerSample = 0.51124ms; SamplesPerSecond = 1956
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[  11-  20 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71514542; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12492s; TotalTimePerSample = 0.49968ms; SamplesPerSecond = 2001
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[  21-  30 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72945595; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12485s; TotalTimePerSample = 0.49942ms; SamplesPerSecond = 2002
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[  31-  40 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70079058; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.12472s; TotalTimePerSample = 0.49886ms; SamplesPerSecond = 2004
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[  41-  50 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70605616; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12435s; TotalTimePerSample = 0.49740ms; SamplesPerSecond = 2010
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[  51-  60 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71572398; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12461s; TotalTimePerSample = 0.49844ms; SamplesPerSecond = 2006
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[  61-  70 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72149851; EvalErr[0]PerSample = 0.48000000; TotalTime = 0.12464s; TotalTimePerSample = 0.49856ms; SamplesPerSecond = 2005
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[  71-  80 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.79845605; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12423s; TotalTimePerSample = 0.49693ms; SamplesPerSecond = 2012
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[  81-  90 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69665186; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12443s; TotalTimePerSample = 0.49770ms; SamplesPerSecond = 2009
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[  91- 100 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70723326; EvalErr[0]PerSample = 0.49200000; TotalTime = 0.12508s; TotalTimePerSample = 0.50033ms; SamplesPerSecond = 1998
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 101- 110 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71420344; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.12484s; TotalTimePerSample = 0.49937ms; SamplesPerSecond = 2002
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 111- 120 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69535258; EvalErr[0]PerSample = 0.43600000; TotalTime = 0.12513s; TotalTimePerSample = 0.50053ms; SamplesPerSecond = 1997
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 121- 130 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70078531; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12505s; TotalTimePerSample = 0.50019ms; SamplesPerSecond = 1999
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 131- 140 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71857915; EvalErr[0]PerSample = 0.54800000; TotalTime = 0.12473s; TotalTimePerSample = 0.49892ms; SamplesPerSecond = 2004
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 141- 150 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72088358; EvalErr[0]PerSample = 0.48800000; TotalTime = 0.12486s; TotalTimePerSample = 0.49946ms; SamplesPerSecond = 2002
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 151- 160 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71798839; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.12508s; TotalTimePerSample = 0.50032ms; SamplesPerSecond = 1998
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 161- 170 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.74162164; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12445s; TotalTimePerSample = 0.49778ms; SamplesPerSecond = 2008
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 171- 180 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71835127; EvalErr[0]PerSample = 0.51600000; TotalTime = 0.12465s; TotalTimePerSample = 0.49860ms; SamplesPerSecond = 2005
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 181- 190 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71529462; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12445s; TotalTimePerSample = 0.49780ms; SamplesPerSecond = 2008
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 191- 200 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71727656; EvalErr[0]PerSample = 0.53200000; TotalTime = 0.12439s; TotalTimePerSample = 0.49756ms; SamplesPerSecond = 2009
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 201- 210 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71745517; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.12479s; TotalTimePerSample = 0.49914ms; SamplesPerSecond = 2003
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 211- 220 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72088397; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12444s; TotalTimePerSample = 0.49776ms; SamplesPerSecond = 2008
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 221- 230 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72006808; EvalErr[0]PerSample = 0.50800000; TotalTime = 0.12512s; TotalTimePerSample = 0.50050ms; SamplesPerSecond = 1998
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 231- 240 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71275468; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12514s; TotalTimePerSample = 0.50054ms; SamplesPerSecond = 1997
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 241- 250 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69644781; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.12465s; TotalTimePerSample = 0.49861ms; SamplesPerSecond = 2005
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 251- 260 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70129697; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12492s; TotalTimePerSample = 0.49967ms; SamplesPerSecond = 2001
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 261- 270 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70768095; EvalErr[0]PerSample = 0.54400000; TotalTime = 0.12508s; TotalTimePerSample = 0.50031ms; SamplesPerSecond = 1998
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 271- 280 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69744379; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.12539s; TotalTimePerSample = 0.50154ms; SamplesPerSecond = 1993
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 281- 290 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69266186; EvalErr[0]PerSample = 0.44800000; TotalTime = 0.12547s; TotalTimePerSample = 0.50188ms; SamplesPerSecond = 1992
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 291- 300 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69347266; EvalErr[0]PerSample = 0.49600000; TotalTime = 0.12492s; TotalTimePerSample = 0.49970ms; SamplesPerSecond = 2001
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 301- 310 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69257410; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12545s; TotalTimePerSample = 0.50181ms; SamplesPerSecond = 1992
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 311- 320 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.68625741; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12518s; TotalTimePerSample = 0.50072ms; SamplesPerSecond = 1997
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 321- 330 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69064011; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12542s; TotalTimePerSample = 0.50169ms; SamplesPerSecond = 1993
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 331- 340 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70192153; EvalErr[0]PerSample = 0.46000000; TotalTime = 0.12517s; TotalTimePerSample = 0.50067ms; SamplesPerSecond = 1997
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 341- 350 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69058912; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12544s; TotalTimePerSample = 0.50177ms; SamplesPerSecond = 1992
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 351- 360 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.67041489; EvalErr[0]PerSample = 0.39200000; TotalTime = 0.12585s; TotalTimePerSample = 0.50340ms; SamplesPerSecond = 1986
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 361- 370 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.65913971; EvalErr[0]PerSample = 0.35600000; TotalTime = 0.12536s; TotalTimePerSample = 0.50142ms; SamplesPerSecond = 1994
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 371- 380 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.63919874; EvalErr[0]PerSample = 0.36400000; TotalTime = 0.12517s; TotalTimePerSample = 0.50069ms; SamplesPerSecond = 1997
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 381- 390 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.61293878; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.12521s; TotalTimePerSample = 0.50085ms; SamplesPerSecond = 1996
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 391- 400 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.55255340; EvalErr[0]PerSample = 0.18800000; TotalTime = 0.12492s; TotalTimePerSample = 0.49968ms; SamplesPerSecond = 2001
+MPI Rank 0: Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.4735; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.008554
+MPI Rank 0: Starting Epoch 2: learning rate per sample = 0.008000  effective momentum = 0.900000 
 MPI Rank 0: starting epoch 1 at record count 10000, and file position 0
 MPI Rank 0: already there from last epoch
 MPI Rank 0: 
 MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 4, NumGradientBits = 32).
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[   1-  10 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.50775200; EvalErr[0]PerSample = 0.23999999; TotalTime = 0.12819s; TotalTimePerSample = 0.51276ms; SamplesPerSecond = 1950
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[  11-  20 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.43389454; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12188s; TotalTimePerSample = 0.48751ms; SamplesPerSecond = 2051
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[  21-  30 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.36675408; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12077s; TotalTimePerSample = 0.48307ms; SamplesPerSecond = 2070
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[  31-  40 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.33769274; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12255s; TotalTimePerSample = 0.49020ms; SamplesPerSecond = 2039
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[  41-  50 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.30321363; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12133s; TotalTimePerSample = 0.48531ms; SamplesPerSecond = 2060
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[  51-  60 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.29576379; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12056s; TotalTimePerSample = 0.48225ms; SamplesPerSecond = 2073
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[  61-  70 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.24924731; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12247s; TotalTimePerSample = 0.48987ms; SamplesPerSecond = 2041
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[  71-  80 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.24632569; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12329s; TotalTimePerSample = 0.49315ms; SamplesPerSecond = 2027
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[  81-  90 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20943311; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12289s; TotalTimePerSample = 0.49156ms; SamplesPerSecond = 2034
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[  91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19116065; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12192s; TotalTimePerSample = 0.48767ms; SamplesPerSecond = 2050
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17923315; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12243s; TotalTimePerSample = 0.48972ms; SamplesPerSecond = 2041
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17075513; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12160s; TotalTimePerSample = 0.48640ms; SamplesPerSecond = 2055
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14442432; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12216s; TotalTimePerSample = 0.48864ms; SamplesPerSecond = 2046
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17753857; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12283s; TotalTimePerSample = 0.49131ms; SamplesPerSecond = 2035
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15087914; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12183s; TotalTimePerSample = 0.48732ms; SamplesPerSecond = 2052
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19252978; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12265s; TotalTimePerSample = 0.49059ms; SamplesPerSecond = 2038
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17830664; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12267s; TotalTimePerSample = 0.49070ms; SamplesPerSecond = 2037
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15115429; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12254s; TotalTimePerSample = 0.49017ms; SamplesPerSecond = 2040
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19135889; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12210s; TotalTimePerSample = 0.48840ms; SamplesPerSecond = 2047
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.21491407; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12306s; TotalTimePerSample = 0.49222ms; SamplesPerSecond = 2031
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18682373; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12157s; TotalTimePerSample = 0.48627ms; SamplesPerSecond = 2056
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18483251; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12138s; TotalTimePerSample = 0.48552ms; SamplesPerSecond = 2059
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14684522; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12230s; TotalTimePerSample = 0.48920ms; SamplesPerSecond = 2044
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15322119; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12308s; TotalTimePerSample = 0.49230ms; SamplesPerSecond = 2031
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19882520; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12374s; TotalTimePerSample = 0.49496ms; SamplesPerSecond = 2020
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13683788; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12243s; TotalTimePerSample = 0.48972ms; SamplesPerSecond = 2041
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18621191; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12193s; TotalTimePerSample = 0.48772ms; SamplesPerSecond = 2050
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19408056; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12169s; TotalTimePerSample = 0.48674ms; SamplesPerSecond = 2054
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17298096; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12219s; TotalTimePerSample = 0.48878ms; SamplesPerSecond = 2045
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13265137; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12193s; TotalTimePerSample = 0.48773ms; SamplesPerSecond = 2050
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17627051; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12157s; TotalTimePerSample = 0.48630ms; SamplesPerSecond = 2056
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12734570; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12409s; TotalTimePerSample = 0.49636ms; SamplesPerSecond = 2014
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15108399; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12116s; TotalTimePerSample = 0.48465ms; SamplesPerSecond = 2063
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19729199; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12268s; TotalTimePerSample = 0.49072ms; SamplesPerSecond = 2037
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12857373; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12203s; TotalTimePerSample = 0.48814ms; SamplesPerSecond = 2048
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13867822; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12274s; TotalTimePerSample = 0.49096ms; SamplesPerSecond = 2036
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12786084; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12172s; TotalTimePerSample = 0.48690ms; SamplesPerSecond = 2053
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16643262; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12108s; TotalTimePerSample = 0.48432ms; SamplesPerSecond = 2064
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20440333; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12238s; TotalTimePerSample = 0.48952ms; SamplesPerSecond = 2042
-MPI Rank 0:  Epoch[ 2 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14566259; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12230s; TotalTimePerSample = 0.48921ms; SamplesPerSecond = 2044
-MPI Rank 0: Finished Epoch[ 2 of 10]: [Training Set] TrainLossPerSample = 0.20373113; EvalErrPerSample = 0.082699999; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.92998
-MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.008000  momentum = 0.900001 
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.50774607; EvalErr[0]PerSample = 0.24000000; TotalTime = 0.12561s; TotalTimePerSample = 0.50245ms; SamplesPerSecond = 1990
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.43388910; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12527s; TotalTimePerSample = 0.50106ms; SamplesPerSecond = 1995
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.36674852; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12554s; TotalTimePerSample = 0.50217ms; SamplesPerSecond = 1991
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.33768746; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12523s; TotalTimePerSample = 0.50093ms; SamplesPerSecond = 1996
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.30320932; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12505s; TotalTimePerSample = 0.50019ms; SamplesPerSecond = 1999
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.29576032; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12563s; TotalTimePerSample = 0.50252ms; SamplesPerSecond = 1989
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.24924483; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12499s; TotalTimePerSample = 0.49998ms; SamplesPerSecond = 2000
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.24632409; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12555s; TotalTimePerSample = 0.50219ms; SamplesPerSecond = 1991
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20943152; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12526s; TotalTimePerSample = 0.50102ms; SamplesPerSecond = 1995
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19115992; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12537s; TotalTimePerSample = 0.50147ms; SamplesPerSecond = 1994
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17923227; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12505s; TotalTimePerSample = 0.50020ms; SamplesPerSecond = 1999
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17075420; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12524s; TotalTimePerSample = 0.50097ms; SamplesPerSecond = 1996
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14442369; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12535s; TotalTimePerSample = 0.50138ms; SamplesPerSecond = 1994
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17753818; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12514s; TotalTimePerSample = 0.50054ms; SamplesPerSecond = 1997
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15087853; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12521s; TotalTimePerSample = 0.50084ms; SamplesPerSecond = 1996
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19253021; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12534s; TotalTimePerSample = 0.50135ms; SamplesPerSecond = 1994
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17830684; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12527s; TotalTimePerSample = 0.50109ms; SamplesPerSecond = 1995
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15115428; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12481s; TotalTimePerSample = 0.49925ms; SamplesPerSecond = 2002
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19135968; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12524s; TotalTimePerSample = 0.50094ms; SamplesPerSecond = 1996
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.21491485; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12494s; TotalTimePerSample = 0.49976ms; SamplesPerSecond = 2000
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18682346; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12540s; TotalTimePerSample = 0.50160ms; SamplesPerSecond = 1993
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18483205; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12495s; TotalTimePerSample = 0.49979ms; SamplesPerSecond = 2000
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14684503; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12495s; TotalTimePerSample = 0.49981ms; SamplesPerSecond = 2000
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15322116; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12531s; TotalTimePerSample = 0.50122ms; SamplesPerSecond = 1995
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19882571; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12523s; TotalTimePerSample = 0.50092ms; SamplesPerSecond = 1996
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13683832; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12499s; TotalTimePerSample = 0.49994ms; SamplesPerSecond = 2000
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18621189; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12490s; TotalTimePerSample = 0.49962ms; SamplesPerSecond = 2001
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19408050; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12603s; TotalTimePerSample = 0.50412ms; SamplesPerSecond = 1983
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17298137; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12517s; TotalTimePerSample = 0.50068ms; SamplesPerSecond = 1997
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13265130; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12562s; TotalTimePerSample = 0.50249ms; SamplesPerSecond = 1990
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17627178; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12516s; TotalTimePerSample = 0.50063ms; SamplesPerSecond = 1997
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12734628; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12563s; TotalTimePerSample = 0.50254ms; SamplesPerSecond = 1989
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15108452; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12518s; TotalTimePerSample = 0.50071ms; SamplesPerSecond = 1997
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19729185; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12573s; TotalTimePerSample = 0.50293ms; SamplesPerSecond = 1988
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12857333; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12511s; TotalTimePerSample = 0.50046ms; SamplesPerSecond = 1998
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13867804; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12548s; TotalTimePerSample = 0.50194ms; SamplesPerSecond = 1992
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12786051; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12555s; TotalTimePerSample = 0.50219ms; SamplesPerSecond = 1991
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16643303; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12504s; TotalTimePerSample = 0.50016ms; SamplesPerSecond = 1999
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20440408; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12571s; TotalTimePerSample = 0.50285ms; SamplesPerSecond = 1988
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14566237; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12526s; TotalTimePerSample = 0.50105ms; SamplesPerSecond = 1995
+MPI Rank 0: Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 0.20373026; EvalErrPerSample = 0.0827; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.013055
+MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.008000  effective momentum = 0.900000 
 MPI Rank 0: starting epoch 2 at record count 20000, and file position 0
 MPI Rank 0: already there from last epoch
 MPI Rank 0: 
 MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 4, NumGradientBits = 32).
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[   1-  10 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12590086; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12915s; TotalTimePerSample = 0.51660ms; SamplesPerSecond = 1935
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[  11-  20 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17780226; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12396s; TotalTimePerSample = 0.49586ms; SamplesPerSecond = 2016
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[  21-  30 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14417633; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12281s; TotalTimePerSample = 0.49125ms; SamplesPerSecond = 2035
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[  31-  40 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15796880; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12283s; TotalTimePerSample = 0.49131ms; SamplesPerSecond = 2035
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[  41-  50 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17002991; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12303s; TotalTimePerSample = 0.49212ms; SamplesPerSecond = 2032
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[  51-  60 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18262109; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12230s; TotalTimePerSample = 0.48918ms; SamplesPerSecond = 2044
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[  61-  70 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14643688; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12105s; TotalTimePerSample = 0.48420ms; SamplesPerSecond = 2065
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[  71-  80 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18030518; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12216s; TotalTimePerSample = 0.48862ms; SamplesPerSecond = 2046
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[  81-  90 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15846142; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12216s; TotalTimePerSample = 0.48865ms; SamplesPerSecond = 2046
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[  91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14486536; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12135s; TotalTimePerSample = 0.48540ms; SamplesPerSecond = 2060
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13469091; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12150s; TotalTimePerSample = 0.48602ms; SamplesPerSecond = 2057
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12149s; TotalTimePerSample = 0.48594ms; SamplesPerSecond = 2057
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.11641297; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12268s; TotalTimePerSample = 0.49073ms; SamplesPerSecond = 2037
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16786633; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12217s; TotalTimePerSample = 0.48868ms; SamplesPerSecond = 2046
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12811548; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12209s; TotalTimePerSample = 0.48836ms; SamplesPerSecond = 2047
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17257836; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12253s; TotalTimePerSample = 0.49013ms; SamplesPerSecond = 2040
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17623682; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12012s; TotalTimePerSample = 0.48046ms; SamplesPerSecond = 2081
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14121118; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12139s; TotalTimePerSample = 0.48557ms; SamplesPerSecond = 2059
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19243409; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12070s; TotalTimePerSample = 0.48279ms; SamplesPerSecond = 2071
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20908155; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12150s; TotalTimePerSample = 0.48600ms; SamplesPerSecond = 2057
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18472095; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12150s; TotalTimePerSample = 0.48602ms; SamplesPerSecond = 2057
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18185547; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12188s; TotalTimePerSample = 0.48750ms; SamplesPerSecond = 2051
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14074194; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12174s; TotalTimePerSample = 0.48698ms; SamplesPerSecond = 2053
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14871632; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12170s; TotalTimePerSample = 0.48680ms; SamplesPerSecond = 2054
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20299682; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12216s; TotalTimePerSample = 0.48864ms; SamplesPerSecond = 2046
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12852076; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12055s; TotalTimePerSample = 0.48221ms; SamplesPerSecond = 2073
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18660498; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12199s; TotalTimePerSample = 0.48796ms; SamplesPerSecond = 2049
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19576025; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12240s; TotalTimePerSample = 0.48961ms; SamplesPerSecond = 2042
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16667627; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12266s; TotalTimePerSample = 0.49062ms; SamplesPerSecond = 2038
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12526172; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12060s; TotalTimePerSample = 0.48238ms; SamplesPerSecond = 2073
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17391992; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12106s; TotalTimePerSample = 0.48423ms; SamplesPerSecond = 2065
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12281641; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12142s; TotalTimePerSample = 0.48570ms; SamplesPerSecond = 2058
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14759424; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12211s; TotalTimePerSample = 0.48846ms; SamplesPerSecond = 2047
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19801368; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12210s; TotalTimePerSample = 0.48840ms; SamplesPerSecond = 2047
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12593359; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12243s; TotalTimePerSample = 0.48971ms; SamplesPerSecond = 2042
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13756640; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12192s; TotalTimePerSample = 0.48767ms; SamplesPerSecond = 2050
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12244s; TotalTimePerSample = 0.48976ms; SamplesPerSecond = 2041
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16654395; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12274s; TotalTimePerSample = 0.49097ms; SamplesPerSecond = 2036
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20658936; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12091s; TotalTimePerSample = 0.48365ms; SamplesPerSecond = 2067
-MPI Rank 0:  Epoch[ 3 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14583300; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12209s; TotalTimePerSample = 0.48834ms; SamplesPerSecond = 2047
-MPI Rank 0: Finished Epoch[ 3 of 10]: [Training Set] TrainLossPerSample = 0.15948617; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.919512
-MPI Rank 0: Starting Epoch 4: learning rate per sample = 0.008000  momentum = 0.900001 
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12590085; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12528s; TotalTimePerSample = 0.50112ms; SamplesPerSecond = 1995
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17780230; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12502s; TotalTimePerSample = 0.50008ms; SamplesPerSecond = 1999
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14417637; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12568s; TotalTimePerSample = 0.50270ms; SamplesPerSecond = 1989
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15796896; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12536s; TotalTimePerSample = 0.50144ms; SamplesPerSecond = 1994
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17003000; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12538s; TotalTimePerSample = 0.50150ms; SamplesPerSecond = 1994
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18262114; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12534s; TotalTimePerSample = 0.50135ms; SamplesPerSecond = 1994
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14643695; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12565s; TotalTimePerSample = 0.50262ms; SamplesPerSecond = 1989
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18030528; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12593s; TotalTimePerSample = 0.50374ms; SamplesPerSecond = 1985
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15846150; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12554s; TotalTimePerSample = 0.50218ms; SamplesPerSecond = 1991
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14486534; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12554s; TotalTimePerSample = 0.50218ms; SamplesPerSecond = 1991
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13469094; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12553s; TotalTimePerSample = 0.50213ms; SamplesPerSecond = 1991
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12517s; TotalTimePerSample = 0.50068ms; SamplesPerSecond = 1997
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.11641295; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12528s; TotalTimePerSample = 0.50113ms; SamplesPerSecond = 1995
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16786647; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12511s; TotalTimePerSample = 0.50044ms; SamplesPerSecond = 1998
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12811514; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12520s; TotalTimePerSample = 0.50080ms; SamplesPerSecond = 1996
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17257851; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12589s; TotalTimePerSample = 0.50356ms; SamplesPerSecond = 1985
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17623656; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12542s; TotalTimePerSample = 0.50168ms; SamplesPerSecond = 1993
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14121117; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12521s; TotalTimePerSample = 0.50086ms; SamplesPerSecond = 1996
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19243442; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12554s; TotalTimePerSample = 0.50215ms; SamplesPerSecond = 1991
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20908161; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12517s; TotalTimePerSample = 0.50070ms; SamplesPerSecond = 1997
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18472067; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12558s; TotalTimePerSample = 0.50231ms; SamplesPerSecond = 1990
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18185536; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12517s; TotalTimePerSample = 0.50067ms; SamplesPerSecond = 1997
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14074204; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12509s; TotalTimePerSample = 0.50034ms; SamplesPerSecond = 1998
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14871620; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12517s; TotalTimePerSample = 0.50068ms; SamplesPerSecond = 1997
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20299705; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12563s; TotalTimePerSample = 0.50250ms; SamplesPerSecond = 1990
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12852037; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12523s; TotalTimePerSample = 0.50091ms; SamplesPerSecond = 1996
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18660440; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12532s; TotalTimePerSample = 0.50126ms; SamplesPerSecond = 1994
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19575997; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12488s; TotalTimePerSample = 0.49953ms; SamplesPerSecond = 2001
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16667676; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12595s; TotalTimePerSample = 0.50380ms; SamplesPerSecond = 1984
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12526168; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12516s; TotalTimePerSample = 0.50065ms; SamplesPerSecond = 1997
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17392133; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12557s; TotalTimePerSample = 0.50228ms; SamplesPerSecond = 1990
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12281615; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12508s; TotalTimePerSample = 0.50032ms; SamplesPerSecond = 1998
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14759390; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12509s; TotalTimePerSample = 0.50035ms; SamplesPerSecond = 1998
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19801300; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12529s; TotalTimePerSample = 0.50114ms; SamplesPerSecond = 1995
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12593395; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12507s; TotalTimePerSample = 0.50028ms; SamplesPerSecond = 1998
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13756617; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12534s; TotalTimePerSample = 0.50136ms; SamplesPerSecond = 1994
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12534s; TotalTimePerSample = 0.50136ms; SamplesPerSecond = 1994
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16654368; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12549s; TotalTimePerSample = 0.50197ms; SamplesPerSecond = 1992
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20658950; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12535s; TotalTimePerSample = 0.50138ms; SamplesPerSecond = 1994
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14583322; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12566s; TotalTimePerSample = 0.50262ms; SamplesPerSecond = 1989
+MPI Rank 0: Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 0.15948618; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.016251
+MPI Rank 0: Starting Epoch 4: learning rate per sample = 0.008000  effective momentum = 0.900000 
 MPI Rank 0: starting epoch 3 at record count 30000, and file position 0
 MPI Rank 0: already there from last epoch
 MPI Rank 0: 
 MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 4, NumGradientBits = 32).
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[   1-  10 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12371233; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12758s; TotalTimePerSample = 0.51030ms; SamplesPerSecond = 1959
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[  11-  20 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18070515; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12300s; TotalTimePerSample = 0.49200ms; SamplesPerSecond = 2032
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[  21-  30 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14239721; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12166s; TotalTimePerSample = 0.48662ms; SamplesPerSecond = 2054
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[  31-  40 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15630139; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12203s; TotalTimePerSample = 0.48814ms; SamplesPerSecond = 2048
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[  41-  50 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16935523; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12092s; TotalTimePerSample = 0.48370ms; SamplesPerSecond = 2067
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[  51-  60 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18198816; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12086s; TotalTimePerSample = 0.48344ms; SamplesPerSecond = 2068
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[  61-  70 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14475952; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12254s; TotalTimePerSample = 0.49015ms; SamplesPerSecond = 2040
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[  71-  80 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18021594; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12209s; TotalTimePerSample = 0.48835ms; SamplesPerSecond = 2047
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[  81-  90 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15849304; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12486s; TotalTimePerSample = 0.49944ms; SamplesPerSecond = 2002
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[  91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14474402; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12218s; TotalTimePerSample = 0.48872ms; SamplesPerSecond = 2046
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13362928; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12226s; TotalTimePerSample = 0.48906ms; SamplesPerSecond = 2044
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13708325; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12182s; TotalTimePerSample = 0.48729ms; SamplesPerSecond = 2052
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.11569763; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12245s; TotalTimePerSample = 0.48979ms; SamplesPerSecond = 2041
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16892321; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12148s; TotalTimePerSample = 0.48593ms; SamplesPerSecond = 2057
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12752125; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12131s; TotalTimePerSample = 0.48522ms; SamplesPerSecond = 2060
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17100880; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12211s; TotalTimePerSample = 0.48845ms; SamplesPerSecond = 2047
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17660449; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12219s; TotalTimePerSample = 0.48878ms; SamplesPerSecond = 2045
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14105836; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12170s; TotalTimePerSample = 0.48681ms; SamplesPerSecond = 2054
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19333544; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12124s; TotalTimePerSample = 0.48496ms; SamplesPerSecond = 2062
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20859498; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12071s; TotalTimePerSample = 0.48282ms; SamplesPerSecond = 2071
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18499707; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12381s; TotalTimePerSample = 0.49524ms; SamplesPerSecond = 2019
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18152441; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12164s; TotalTimePerSample = 0.48656ms; SamplesPerSecond = 2055
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14037134; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12261s; TotalTimePerSample = 0.49044ms; SamplesPerSecond = 2038
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14866894; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12391s; TotalTimePerSample = 0.49566ms; SamplesPerSecond = 2017
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20347705; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12231s; TotalTimePerSample = 0.48922ms; SamplesPerSecond = 2044
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12815039; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12248s; TotalTimePerSample = 0.48991ms; SamplesPerSecond = 2041
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18672803; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12289s; TotalTimePerSample = 0.49156ms; SamplesPerSecond = 2034
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19552930; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12108s; TotalTimePerSample = 0.48432ms; SamplesPerSecond = 2064
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16452637; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12195s; TotalTimePerSample = 0.48782ms; SamplesPerSecond = 2049
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12461865; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12028s; TotalTimePerSample = 0.48113ms; SamplesPerSecond = 2078
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17285107; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12185s; TotalTimePerSample = 0.48742ms; SamplesPerSecond = 2051
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12253613; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12355s; TotalTimePerSample = 0.49419ms; SamplesPerSecond = 2023
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14723291; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12238s; TotalTimePerSample = 0.48954ms; SamplesPerSecond = 2042
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19789551; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12366s; TotalTimePerSample = 0.49465ms; SamplesPerSecond = 2021
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12575878; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12262s; TotalTimePerSample = 0.49046ms; SamplesPerSecond = 2038
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13745947; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12227s; TotalTimePerSample = 0.48906ms; SamplesPerSecond = 2044
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12839746; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12155s; TotalTimePerSample = 0.48621ms; SamplesPerSecond = 2056
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16647315; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12211s; TotalTimePerSample = 0.48844ms; SamplesPerSecond = 2047
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20679444; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12144s; TotalTimePerSample = 0.48576ms; SamplesPerSecond = 2058
-MPI Rank 0:  Epoch[ 4 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14585204; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12219s; TotalTimePerSample = 0.48876ms; SamplesPerSecond = 2045
-MPI Rank 0: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15914927; EvalErrPerSample = 0.076700002; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.927898
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12371233; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12540s; TotalTimePerSample = 0.50158ms; SamplesPerSecond = 1993
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18070514; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12543s; TotalTimePerSample = 0.50172ms; SamplesPerSecond = 1993
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14239731; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12566s; TotalTimePerSample = 0.50264ms; SamplesPerSecond = 1989
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15630155; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12541s; TotalTimePerSample = 0.50163ms; SamplesPerSecond = 1993
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16935525; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12543s; TotalTimePerSample = 0.50174ms; SamplesPerSecond = 1993
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18198833; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12546s; TotalTimePerSample = 0.50182ms; SamplesPerSecond = 1992
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14475946; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12534s; TotalTimePerSample = 0.50134ms; SamplesPerSecond = 1994
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18021602; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12470s; TotalTimePerSample = 0.49880ms; SamplesPerSecond = 2004
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15849308; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12530s; TotalTimePerSample = 0.50120ms; SamplesPerSecond = 1995
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14474426; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12567s; TotalTimePerSample = 0.50270ms; SamplesPerSecond = 1989
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13362926; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12452s; TotalTimePerSample = 0.49810ms; SamplesPerSecond = 2007
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13708300; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12525s; TotalTimePerSample = 0.50101ms; SamplesPerSecond = 1995
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.11569776; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12514s; TotalTimePerSample = 0.50057ms; SamplesPerSecond = 1997
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16892330; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12558s; TotalTimePerSample = 0.50232ms; SamplesPerSecond = 1990
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12752163; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12515s; TotalTimePerSample = 0.50060ms; SamplesPerSecond = 1997
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17100866; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12566s; TotalTimePerSample = 0.50264ms; SamplesPerSecond = 1989
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17660425; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12500s; TotalTimePerSample = 0.50000ms; SamplesPerSecond = 1999
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14105804; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12543s; TotalTimePerSample = 0.50173ms; SamplesPerSecond = 1993
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19333553; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12536s; TotalTimePerSample = 0.50145ms; SamplesPerSecond = 1994
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20859525; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12520s; TotalTimePerSample = 0.50081ms; SamplesPerSecond = 1996
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18499677; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12599s; TotalTimePerSample = 0.50396ms; SamplesPerSecond = 1984
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18152438; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12547s; TotalTimePerSample = 0.50186ms; SamplesPerSecond = 1992
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14037158; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12545s; TotalTimePerSample = 0.50178ms; SamplesPerSecond = 1992
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14866862; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12547s; TotalTimePerSample = 0.50189ms; SamplesPerSecond = 1992
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20347748; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12559s; TotalTimePerSample = 0.50235ms; SamplesPerSecond = 1990
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12815013; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12526s; TotalTimePerSample = 0.50102ms; SamplesPerSecond = 1995
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18672810; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12549s; TotalTimePerSample = 0.50196ms; SamplesPerSecond = 1992
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19552989; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12511s; TotalTimePerSample = 0.50044ms; SamplesPerSecond = 1998
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16452642; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12559s; TotalTimePerSample = 0.50234ms; SamplesPerSecond = 1990
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12461825; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12496s; TotalTimePerSample = 0.49984ms; SamplesPerSecond = 2000
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17285251; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12577s; TotalTimePerSample = 0.50309ms; SamplesPerSecond = 1987
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12253620; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12559s; TotalTimePerSample = 0.50234ms; SamplesPerSecond = 1990
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14723333; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12547s; TotalTimePerSample = 0.50189ms; SamplesPerSecond = 1992
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19789537; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12617s; TotalTimePerSample = 0.50469ms; SamplesPerSecond = 1981
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12575877; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12609s; TotalTimePerSample = 0.50438ms; SamplesPerSecond = 1982
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13745928; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12558s; TotalTimePerSample = 0.50230ms; SamplesPerSecond = 1990
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12839652; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12498s; TotalTimePerSample = 0.49990ms; SamplesPerSecond = 2000
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16647280; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12548s; TotalTimePerSample = 0.50191ms; SamplesPerSecond = 1992
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20679434; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12528s; TotalTimePerSample = 0.50111ms; SamplesPerSecond = 1995
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12570s; TotalTimePerSample = 0.50281ms; SamplesPerSecond = 1988
+MPI Rank 0: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.018144
+MPI Rank 0: CNTKCommandTrainEnd: SimpleMultiGPU
 MPI Rank 0: COMPLETED
 MPI Rank 0: ~MPIWrapper
-MPI Rank 1: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank1
-MPI Rank 1: -------------------------------------------------------------------
-MPI Rank 1: Build info: 
-MPI Rank 1: 
-MPI Rank 1: 		Built time: Aug 25 2015 17:44:46
-MPI Rank 1: 		Last modified date: Mon Aug 24 16:38:42 2015
-MPI Rank 1: 		Built by amitaga on Amitaga-Win-DT3           
-MPI Rank 1: 		Build Path: E:\NetScale\CNTK\git_repos\public_master\MachineLearning\CNTK\
-MPI Rank 1: 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
-MPI Rank 1: -------------------------------------------------------------------
-MPI Rank 1: running on Amitaga-Win-DT3 at 2015/08/26 01:48:43
-MPI Rank 1: command line options: 
-MPI Rank 1: configFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\SimpleMultiGPU.config RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] 
+MPI Rank 1: running on localhost at 2015/10/24 12:44:54
+MPI Rank 1: command line: 
+MPI Rank 1: /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../../SimpleMultiGPU.config RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../.. DeviceId=0 precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr 
 MPI Rank 1: 
 MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
 MPI Rank 1: deviceId=$DeviceId$
@@ -544,12 +663,11 @@ MPI Rank 1:         minibatchSize=25
 MPI Rank 1:         learningRatesPerMB=0.5:0.2*20:0.1
 MPI Rank 1:         momentumPerMB=0.9
 MPI Rank 1:         dropoutRate=0.0
-MPI Rank 1:         maxEpochs=10
+MPI Rank 1:         maxEpochs=4
 MPI Rank 1:         ParallelTrain=[
 MPI Rank 1:             parallelizationMethod=DataParallelSGD
 MPI Rank 1:             DataParallelSGD=[
 MPI Rank 1:               gradientBits=1
-MPI Rank 1:               parallelizationStartEpoch=1
 MPI Rank 1:             ]
 MPI Rank 1:         ]
 MPI Rank 1:     ]
@@ -571,12 +689,13 @@ MPI Rank 1:         labelMappingFile=$DataDir$/SimpleMapping.txt
 MPI Rank 1:       ]
 MPI Rank 1:     ]
 MPI Rank 1: ]
-MPI Rank 1: RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
-MPI Rank 1: DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data
+MPI Rank 1: RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 1: DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data
+MPI Rank 1: ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../..
 MPI Rank 1: DeviceId=0
-MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
 MPI Rank 1: precision=float
 MPI Rank 1: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
+MPI Rank 1: stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr
 MPI Rank 1: 
 MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
 MPI Rank 1: 
@@ -587,7 +706,7 @@ MPI Rank 1: precision=float
 MPI Rank 1: parallelTrain=true
 MPI Rank 1: SimpleMultiGPU=[
 MPI Rank 1:     action=train
-MPI Rank 1:     modelPath=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 1:     modelPath=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
 MPI Rank 1:     deviceId=0
 MPI Rank 1:     traceLevel=1
 MPI Rank 1:     SimpleNetworkBuilder=[
@@ -606,18 +725,17 @@ MPI Rank 1:         minibatchSize=25
 MPI Rank 1:         learningRatesPerMB=0.5:0.2*20:0.1
 MPI Rank 1:         momentumPerMB=0.9
 MPI Rank 1:         dropoutRate=0.0
-MPI Rank 1:         maxEpochs=10
+MPI Rank 1:         maxEpochs=4
 MPI Rank 1:         ParallelTrain=[
 MPI Rank 1:             parallelizationMethod=DataParallelSGD
 MPI Rank 1:             DataParallelSGD=[
 MPI Rank 1:               gradientBits=1
-MPI Rank 1:               parallelizationStartEpoch=1
 MPI Rank 1:             ]
 MPI Rank 1:         ]
 MPI Rank 1:     ]
 MPI Rank 1:     reader=[
 MPI Rank 1:       readerType=UCIFastReader
-MPI Rank 1:       file=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 1:       file=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt
 MPI Rank 1:       miniBatchMode=Partial
 MPI Rank 1:       randomize=None
 MPI Rank 1:       verbosity=1   
@@ -629,29 +747,31 @@ MPI Rank 1:       labels=[
 MPI Rank 1: start=2      
 MPI Rank 1: dim=1        
 MPI Rank 1: labelDim=2   
-MPI Rank 1:         labelMappingFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleMapping.txt
+MPI Rank 1:         labelMappingFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleMapping.txt
 MPI Rank 1:       ]
 MPI Rank 1:     ]
 MPI Rank 1: ]
-MPI Rank 1: RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
-MPI Rank 1: DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data
+MPI Rank 1: RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 1: DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data
+MPI Rank 1: ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../..
 MPI Rank 1: DeviceId=0
-MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
 MPI Rank 1: precision=float
 MPI Rank 1: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
+MPI Rank 1: stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr
 MPI Rank 1: 
 MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 MPI Rank 1: 
 MPI Rank 1: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 MPI Rank 1: configparameters: SimpleMultiGPU.config:command=SimpleMultiGPU
-MPI Rank 1: configparameters: SimpleMultiGPU.config:DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data
+MPI Rank 1: configparameters: SimpleMultiGPU.config:ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../..
+MPI Rank 1: configparameters: SimpleMultiGPU.config:DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data
 MPI Rank 1: configparameters: SimpleMultiGPU.config:deviceId=0
 MPI Rank 1: configparameters: SimpleMultiGPU.config:parallelTrain=true
 MPI Rank 1: configparameters: SimpleMultiGPU.config:precision=float
-MPI Rank 1: configparameters: SimpleMultiGPU.config:RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 1: configparameters: SimpleMultiGPU.config:RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu
 MPI Rank 1: configparameters: SimpleMultiGPU.config:SimpleMultiGPU=[
 MPI Rank 1:     action=train
-MPI Rank 1:     modelPath=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 1:     modelPath=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
 MPI Rank 1:     deviceId=0
 MPI Rank 1:     traceLevel=1
 MPI Rank 1:     SimpleNetworkBuilder=[
@@ -670,18 +790,17 @@ MPI Rank 1:         minibatchSize=25
 MPI Rank 1:         learningRatesPerMB=0.5:0.2*20:0.1
 MPI Rank 1:         momentumPerMB=0.9
 MPI Rank 1:         dropoutRate=0.0
-MPI Rank 1:         maxEpochs=10
+MPI Rank 1:         maxEpochs=4
 MPI Rank 1:         ParallelTrain=[
 MPI Rank 1:             parallelizationMethod=DataParallelSGD
 MPI Rank 1:             DataParallelSGD=[
 MPI Rank 1:               gradientBits=1
-MPI Rank 1:               parallelizationStartEpoch=1
 MPI Rank 1:             ]
 MPI Rank 1:         ]
 MPI Rank 1:     ]
 MPI Rank 1:     reader=[
 MPI Rank 1:       readerType=UCIFastReader
-MPI Rank 1:       file=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 1:       file=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt
 MPI Rank 1:       miniBatchMode=Partial
 MPI Rank 1:       randomize=None
 MPI Rank 1:       verbosity=1   
@@ -693,45 +812,100 @@ MPI Rank 1:       labels=[
 MPI Rank 1: start=2      
 MPI Rank 1: dim=1        
 MPI Rank 1: labelDim=2   
-MPI Rank 1:         labelMappingFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleMapping.txt
+MPI Rank 1:         labelMappingFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleMapping.txt
 MPI Rank 1:       ]
 MPI Rank 1:     ]
 MPI Rank 1: ] [SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
 MPI Rank 1: 
-MPI Rank 1: configparameters: SimpleMultiGPU.config:stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
+MPI Rank 1: configparameters: SimpleMultiGPU.config:stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr
 MPI Rank 1: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 MPI Rank 1: command: SimpleMultiGPU 
 MPI Rank 1: precision = float
+MPI Rank 1: CNTKModelPath: /tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 1: CNTKCommandTrainInfo: SimpleMultiGPU : 4
+MPI Rank 1: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 4
+MPI Rank 1: CNTKCommandTrainBegin: SimpleMultiGPU
 MPI Rank 1: SimpleNetworkBuilder Using GPU 0
-MPI Rank 1: reading uci file E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 1: reading uci file /home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt
+MPI Rank 1: SetUniformRandomValue (GPU): creating curand object with seed 1
 MPI Rank 1: GetTrainCriterionNodes  ...
 MPI Rank 1: GetEvalCriterionNodes  ...
 MPI Rank 1: 
 MPI Rank 1: 
-MPI Rank 1: Validating node CrossEntropyWithSoftmax 
+MPI Rank 1: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1.
 MPI Rank 1: 
-MPI Rank 1: Validating --> labels = InputValue
-MPI Rank 1: Validating --> W2 = LearnableParameter
-MPI Rank 1: Validating --> W1 = LearnableParameter
-MPI Rank 1: Validating --> W0 = LearnableParameter
-MPI Rank 1: Validating --> features = InputValue
-MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, 3])
-MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, 3])
-MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1])
-MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, 3])
-MPI Rank 1: Validating --> B0 = LearnableParameter
-MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, 3], B0[50, 1])
-MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, 3])
-MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, 3])
-MPI Rank 1: Validating --> B1 = LearnableParameter
-MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, 3], B1[50, 1])
-MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, 3])
-MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, 3])
-MPI Rank 1: Validating --> B2 = LearnableParameter
-MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, 3], B2[2, 1])
-MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, 3], HLast[2, 3])
+MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 1: 
+MPI Rank 1: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 1: 
+MPI Rank 1: Validating for node CrossEntropyWithSoftmax, final verification.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 1: 
+MPI Rank 1: 9 out of 20 nodes do not share the minibatch layout with the input data.
+MPI Rank 1: 
+MPI Rank 1: 
+MPI Rank 1: Precomputing --> 3 PreCompute nodes found.
 MPI Rank 1: 
-MPI Rank 1: Found 3 PreCompute nodes
 MPI Rank 1: 	NodeName: InvStdOfFeatures
 MPI Rank 1: 	NodeName: MeanOfFeatures
 MPI Rank 1: 	NodeName: Prior
@@ -742,250 +916,320 @@ MPI Rank 1: starting epoch 0 at record count 0, and file position 0
 MPI Rank 1: already there from last epoch
 MPI Rank 1: 
 MPI Rank 1: 
-MPI Rank 1: Validating node InvStdOfFeatures 
+MPI Rank 1: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1.
 MPI Rank 1: 
-MPI Rank 1: Validating --> features = InputValue
-MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, 25])
+MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: 
+MPI Rank 1: Validating for node InvStdOfFeatures, final verification.
+MPI Rank 1: 
+MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: 
+MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data.
 MPI Rank 1: 
 MPI Rank 1: 
 MPI Rank 1: 
-MPI Rank 1: Validating node MeanOfFeatures 
+MPI Rank 1: Validating for node MeanOfFeatures. 2 nodes to process in pass 1.
 MPI Rank 1: 
-MPI Rank 1: Validating --> features = InputValue
-MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, 25])
+MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: 
+MPI Rank 1: Validating for node MeanOfFeatures, final verification.
+MPI Rank 1: 
+MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: 
+MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data.
 MPI Rank 1: 
 MPI Rank 1: 
 MPI Rank 1: 
-MPI Rank 1: Validating node Prior 
+MPI Rank 1: Validating for node Prior. 2 nodes to process in pass 1.
 MPI Rank 1: 
-MPI Rank 1: Validating --> labels = InputValue
-MPI Rank 1: Validating --> Prior = Mean(labels[2, 25])
+MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: 
+MPI Rank 1: Validating for node Prior. 1 nodes to process in pass 2.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: 
+MPI Rank 1: Validating for node Prior, final verification.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: 
+MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data.
+MPI Rank 1: 
+MPI Rank 1: EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once.
+MPI Rank 1: 
+MPI Rank 1: Precomputing --> Completed.
 MPI Rank 1: 
 MPI Rank 1: Set Max Temp Mem Size For Convolution Nodes to 0 samples.
-MPI Rank 1: Starting Epoch 1: learning rate per sample = 0.020000  momentum = 0.900001 
+MPI Rank 1: Starting Epoch 1: learning rate per sample = 0.020000  effective momentum = 0.900000 
 MPI Rank 1: starting epoch 0 at record count 0, and file position 0
 MPI Rank 1: already there from last epoch
 MPI Rank 1: 
+MPI Rank 1: 
+MPI Rank 1: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 1: 
+MPI Rank 1: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 1: 
+MPI Rank 1: Validating for node EvalErrorPrediction, final verification.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 1: 
+MPI Rank 1: 9 out of 20 nodes do not share the minibatch layout with the input data.
+MPI Rank 1: 
+MPI Rank 1: 
 MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 4, NumGradientBits = 32).
-MPI Rank 1: 
-MPI Rank 1: 
-MPI Rank 1: Validating node EvalErrorPrediction 
-MPI Rank 1: 
-MPI Rank 1: Validating --> labels = InputValue
-MPI Rank 1: Validating --> W2 = LearnableParameter
-MPI Rank 1: Validating --> W1 = LearnableParameter
-MPI Rank 1: Validating --> W0 = LearnableParameter
-MPI Rank 1: Validating --> features = InputValue
-MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, 6])
-MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, 6])
-MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, 6], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1])
-MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, 6])
-MPI Rank 1: Validating --> B0 = LearnableParameter
-MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, 6], B0[50, 1])
-MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, 6])
-MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, 6])
-MPI Rank 1: Validating --> B1 = LearnableParameter
-MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, 6], B1[50, 1])
-MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, 6])
-MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, 6])
-MPI Rank 1: Validating --> B2 = LearnableParameter
-MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, 6], B2[2, 1])
-MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, 6], HLast[2, 6])
-MPI Rank 1: 
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[   1-  10 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70007980; EvalErr[0]PerSample = 0.52399999; TotalTime = 0.19936s; TotalTimePerSample = 0.79746ms; SamplesPerSecond = 1253
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[  11-  20 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71514523; EvalErr[0]PerSample = 0.51999998; TotalTime = 0.15552s; TotalTimePerSample = 0.62208ms; SamplesPerSecond = 1607
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[  21-  30 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.72945595; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.14887s; TotalTimePerSample = 0.59550ms; SamplesPerSecond = 1679
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[  31-  40 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70079005; EvalErr[0]PerSample = 0.52399999; TotalTime = 0.14476s; TotalTimePerSample = 0.57905ms; SamplesPerSecond = 1726
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[  41-  50 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70605618; EvalErr[0]PerSample = 0.54000002; TotalTime = 0.14228s; TotalTimePerSample = 0.56912ms; SamplesPerSecond = 1757
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[  51-  60 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71572435; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.13675s; TotalTimePerSample = 0.54699ms; SamplesPerSecond = 1828
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[  61-  70 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.72149903; EvalErr[0]PerSample = 0.47999999; TotalTime = 0.13631s; TotalTimePerSample = 0.54524ms; SamplesPerSecond = 1834
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[  71-  80 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.79845655; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.13435s; TotalTimePerSample = 0.53738ms; SamplesPerSecond = 1860
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[  81-  90 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69665188; EvalErr[0]PerSample = 0.46799999; TotalTime = 0.13044s; TotalTimePerSample = 0.52174ms; SamplesPerSecond = 1916
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[  91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70723337; EvalErr[0]PerSample = 0.49200001; TotalTime = 0.12786s; TotalTimePerSample = 0.51146ms; SamplesPerSecond = 1955
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71420360; EvalErr[0]PerSample = 0.55199999; TotalTime = 0.12631s; TotalTimePerSample = 0.50524ms; SamplesPerSecond = 1979
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69535255; EvalErr[0]PerSample = 0.43599999; TotalTime = 0.12559s; TotalTimePerSample = 0.50234ms; SamplesPerSecond = 1990
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70078611; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12261s; TotalTimePerSample = 0.49046ms; SamplesPerSecond = 2038
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71857810; EvalErr[0]PerSample = 0.54799998; TotalTime = 0.12293s; TotalTimePerSample = 0.49171ms; SamplesPerSecond = 2033
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.72088283; EvalErr[0]PerSample = 0.48800001; TotalTime = 0.12233s; TotalTimePerSample = 0.48930ms; SamplesPerSecond = 2043
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71798825; EvalErr[0]PerSample = 0.55199999; TotalTime = 0.12123s; TotalTimePerSample = 0.48494ms; SamplesPerSecond = 2062
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.74162209; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12330s; TotalTimePerSample = 0.49320ms; SamplesPerSecond = 2027
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71835059; EvalErr[0]PerSample = 0.51599997; TotalTime = 0.12340s; TotalTimePerSample = 0.49358ms; SamplesPerSecond = 2025
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71529394; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12334s; TotalTimePerSample = 0.49336ms; SamplesPerSecond = 2026
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71727639; EvalErr[0]PerSample = 0.53200001; TotalTime = 0.12473s; TotalTimePerSample = 0.49890ms; SamplesPerSecond = 2004
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71745312; EvalErr[0]PerSample = 0.50400001; TotalTime = 0.12362s; TotalTimePerSample = 0.49447ms; SamplesPerSecond = 2022
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.72088087; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12197s; TotalTimePerSample = 0.48789ms; SamplesPerSecond = 2049
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.72006541; EvalErr[0]PerSample = 0.50800002; TotalTime = 0.12266s; TotalTimePerSample = 0.49064ms; SamplesPerSecond = 2038
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71275192; EvalErr[0]PerSample = 0.51200002; TotalTime = 0.12162s; TotalTimePerSample = 0.48650ms; SamplesPerSecond = 2055
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69645119; EvalErr[0]PerSample = 0.50400001; TotalTime = 0.12122s; TotalTimePerSample = 0.48486ms; SamplesPerSecond = 2062
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70129883; EvalErr[0]PerSample = 0.51200002; TotalTime = 0.12089s; TotalTimePerSample = 0.48356ms; SamplesPerSecond = 2068
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70768166; EvalErr[0]PerSample = 0.54400003; TotalTime = 0.12174s; TotalTimePerSample = 0.48697ms; SamplesPerSecond = 2053
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69744140; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.12268s; TotalTimePerSample = 0.49072ms; SamplesPerSecond = 2037
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69266015; EvalErr[0]PerSample = 0.44800001; TotalTime = 0.12181s; TotalTimePerSample = 0.48722ms; SamplesPerSecond = 2052
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69347072; EvalErr[0]PerSample = 0.49599999; TotalTime = 0.12233s; TotalTimePerSample = 0.48930ms; SamplesPerSecond = 2043
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69257420; EvalErr[0]PerSample = 0.54000002; TotalTime = 0.12288s; TotalTimePerSample = 0.49154ms; SamplesPerSecond = 2034
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.68625975; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12417s; TotalTimePerSample = 0.49669ms; SamplesPerSecond = 2013
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69064063; EvalErr[0]PerSample = 0.46799999; TotalTime = 0.12339s; TotalTimePerSample = 0.49354ms; SamplesPerSecond = 2026
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70192385; EvalErr[0]PerSample = 0.46000001; TotalTime = 0.12176s; TotalTimePerSample = 0.48704ms; SamplesPerSecond = 2053
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69058985; EvalErr[0]PerSample = 0.51999998; TotalTime = 0.12238s; TotalTimePerSample = 0.48952ms; SamplesPerSecond = 2042
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.67041212; EvalErr[0]PerSample = 0.39199999; TotalTime = 0.12186s; TotalTimePerSample = 0.48744ms; SamplesPerSecond = 2051
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.65914255; EvalErr[0]PerSample = 0.35600001; TotalTime = 0.12264s; TotalTimePerSample = 0.49054ms; SamplesPerSecond = 2038
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.63919920; EvalErr[0]PerSample = 0.36399999; TotalTime = 0.12265s; TotalTimePerSample = 0.49062ms; SamplesPerSecond = 2038
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.61294138; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.12143s; TotalTimePerSample = 0.48574ms; SamplesPerSecond = 2058
-MPI Rank 1:  Epoch[ 1 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.55255663; EvalErr[0]PerSample = 0.18799999; TotalTime = 0.12034s; TotalTimePerSample = 0.48136ms; SamplesPerSecond = 2077
-MPI Rank 1: Finished Epoch[ 1 of 10]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.47350001; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.253972
-MPI Rank 1: Starting Epoch 2: learning rate per sample = 0.008000  momentum = 0.900001 
+MPI Rank 1: DecimateMinibatchSequences: WARNING: Number of parallel utterances 25 not a multiple of number of GPUs 4, GPU usage will be suboptimal.
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[   1-  10 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70007977; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.12718s; TotalTimePerSample = 0.50872ms; SamplesPerSecond = 1965
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[  11-  20 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71514542; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12478s; TotalTimePerSample = 0.49911ms; SamplesPerSecond = 2003
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[  21-  30 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72945595; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12484s; TotalTimePerSample = 0.49937ms; SamplesPerSecond = 2002
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[  31-  40 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70079058; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.12471s; TotalTimePerSample = 0.49883ms; SamplesPerSecond = 2004
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[  41-  50 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70605616; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12439s; TotalTimePerSample = 0.49754ms; SamplesPerSecond = 2009
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[  51-  60 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71572398; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12468s; TotalTimePerSample = 0.49870ms; SamplesPerSecond = 2005
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[  61-  70 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72149851; EvalErr[0]PerSample = 0.48000000; TotalTime = 0.12451s; TotalTimePerSample = 0.49804ms; SamplesPerSecond = 2007
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[  71-  80 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.79845605; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12436s; TotalTimePerSample = 0.49745ms; SamplesPerSecond = 2010
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[  81-  90 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69665186; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12441s; TotalTimePerSample = 0.49766ms; SamplesPerSecond = 2009
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[  91- 100 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70723326; EvalErr[0]PerSample = 0.49200000; TotalTime = 0.12505s; TotalTimePerSample = 0.50020ms; SamplesPerSecond = 1999
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 101- 110 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71420344; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.12494s; TotalTimePerSample = 0.49974ms; SamplesPerSecond = 2001
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 111- 120 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69535258; EvalErr[0]PerSample = 0.43600000; TotalTime = 0.12504s; TotalTimePerSample = 0.50015ms; SamplesPerSecond = 1999
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 121- 130 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70078531; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12504s; TotalTimePerSample = 0.50018ms; SamplesPerSecond = 1999
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 131- 140 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71857915; EvalErr[0]PerSample = 0.54800000; TotalTime = 0.12470s; TotalTimePerSample = 0.49879ms; SamplesPerSecond = 2004
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 141- 150 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72088358; EvalErr[0]PerSample = 0.48800000; TotalTime = 0.12489s; TotalTimePerSample = 0.49955ms; SamplesPerSecond = 2001
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 151- 160 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71798839; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.12496s; TotalTimePerSample = 0.49986ms; SamplesPerSecond = 2000
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 161- 170 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.74162164; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12439s; TotalTimePerSample = 0.49757ms; SamplesPerSecond = 2009
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 171- 180 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71835127; EvalErr[0]PerSample = 0.51600000; TotalTime = 0.12480s; TotalTimePerSample = 0.49921ms; SamplesPerSecond = 2003
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 181- 190 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71529462; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12442s; TotalTimePerSample = 0.49767ms; SamplesPerSecond = 2009
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 191- 200 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71727656; EvalErr[0]PerSample = 0.53200000; TotalTime = 0.12438s; TotalTimePerSample = 0.49752ms; SamplesPerSecond = 2009
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 201- 210 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71745517; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.12477s; TotalTimePerSample = 0.49908ms; SamplesPerSecond = 2003
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 211- 220 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72088397; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12431s; TotalTimePerSample = 0.49724ms; SamplesPerSecond = 2011
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 221- 230 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72006808; EvalErr[0]PerSample = 0.50800000; TotalTime = 0.12485s; TotalTimePerSample = 0.49941ms; SamplesPerSecond = 2002
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 231- 240 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71275468; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12509s; TotalTimePerSample = 0.50034ms; SamplesPerSecond = 1998
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 241- 250 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69644781; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.12464s; TotalTimePerSample = 0.49854ms; SamplesPerSecond = 2005
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 251- 260 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70129697; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12495s; TotalTimePerSample = 0.49980ms; SamplesPerSecond = 2000
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 261- 270 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70768095; EvalErr[0]PerSample = 0.54400000; TotalTime = 0.12487s; TotalTimePerSample = 0.49948ms; SamplesPerSecond = 2002
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 271- 280 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69744379; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.12557s; TotalTimePerSample = 0.50229ms; SamplesPerSecond = 1990
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 281- 290 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69266186; EvalErr[0]PerSample = 0.44800000; TotalTime = 0.12540s; TotalTimePerSample = 0.50162ms; SamplesPerSecond = 1993
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 291- 300 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69347266; EvalErr[0]PerSample = 0.49600000; TotalTime = 0.12498s; TotalTimePerSample = 0.49991ms; SamplesPerSecond = 2000
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 301- 310 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69257410; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12543s; TotalTimePerSample = 0.50172ms; SamplesPerSecond = 1993
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 311- 320 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.68625741; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12516s; TotalTimePerSample = 0.50066ms; SamplesPerSecond = 1997
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 321- 330 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69064011; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12542s; TotalTimePerSample = 0.50169ms; SamplesPerSecond = 1993
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 331- 340 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70192153; EvalErr[0]PerSample = 0.46000000; TotalTime = 0.12511s; TotalTimePerSample = 0.50042ms; SamplesPerSecond = 1998
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 341- 350 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69058912; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12551s; TotalTimePerSample = 0.50205ms; SamplesPerSecond = 1991
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 351- 360 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.67041489; EvalErr[0]PerSample = 0.39200000; TotalTime = 0.12580s; TotalTimePerSample = 0.50319ms; SamplesPerSecond = 1987
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 361- 370 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.65913971; EvalErr[0]PerSample = 0.35600000; TotalTime = 0.12534s; TotalTimePerSample = 0.50137ms; SamplesPerSecond = 1994
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 371- 380 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.63919874; EvalErr[0]PerSample = 0.36400000; TotalTime = 0.12514s; TotalTimePerSample = 0.50057ms; SamplesPerSecond = 1997
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 381- 390 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.61293878; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.12522s; TotalTimePerSample = 0.50088ms; SamplesPerSecond = 1996
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 391- 400 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.55255340; EvalErr[0]PerSample = 0.18800000; TotalTime = 0.12484s; TotalTimePerSample = 0.49937ms; SamplesPerSecond = 2002
+MPI Rank 1: Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.4735; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.008953
+MPI Rank 1: Starting Epoch 2: learning rate per sample = 0.008000  effective momentum = 0.900000 
 MPI Rank 1: starting epoch 1 at record count 10000, and file position 0
 MPI Rank 1: already there from last epoch
 MPI Rank 1: 
 MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 4, NumGradientBits = 32).
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[   1-  10 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.50775200; EvalErr[0]PerSample = 0.23999999; TotalTime = 0.12819s; TotalTimePerSample = 0.51274ms; SamplesPerSecond = 1950
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[  11-  20 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.43389454; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12189s; TotalTimePerSample = 0.48755ms; SamplesPerSecond = 2051
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[  21-  30 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.36675408; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12077s; TotalTimePerSample = 0.48309ms; SamplesPerSecond = 2070
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[  31-  40 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.33769274; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12255s; TotalTimePerSample = 0.49019ms; SamplesPerSecond = 2040
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[  41-  50 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.30321363; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12133s; TotalTimePerSample = 0.48531ms; SamplesPerSecond = 2060
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[  51-  60 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.29576379; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12054s; TotalTimePerSample = 0.48217ms; SamplesPerSecond = 2073
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[  61-  70 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.24924731; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12247s; TotalTimePerSample = 0.48987ms; SamplesPerSecond = 2041
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[  71-  80 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.24632569; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12330s; TotalTimePerSample = 0.49319ms; SamplesPerSecond = 2027
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[  81-  90 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20943311; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12290s; TotalTimePerSample = 0.49159ms; SamplesPerSecond = 2034
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[  91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19116065; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12191s; TotalTimePerSample = 0.48764ms; SamplesPerSecond = 2050
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17923315; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12243s; TotalTimePerSample = 0.48973ms; SamplesPerSecond = 2041
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17075513; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12160s; TotalTimePerSample = 0.48641ms; SamplesPerSecond = 2055
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14442432; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12216s; TotalTimePerSample = 0.48864ms; SamplesPerSecond = 2046
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17753857; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12282s; TotalTimePerSample = 0.49126ms; SamplesPerSecond = 2035
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15087914; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12184s; TotalTimePerSample = 0.48735ms; SamplesPerSecond = 2051
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19252978; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12266s; TotalTimePerSample = 0.49063ms; SamplesPerSecond = 2038
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17830664; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12268s; TotalTimePerSample = 0.49071ms; SamplesPerSecond = 2037
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15115429; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12258s; TotalTimePerSample = 0.49032ms; SamplesPerSecond = 2039
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19135889; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12208s; TotalTimePerSample = 0.48832ms; SamplesPerSecond = 2047
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.21491407; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12305s; TotalTimePerSample = 0.49221ms; SamplesPerSecond = 2031
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18682373; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12157s; TotalTimePerSample = 0.48628ms; SamplesPerSecond = 2056
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18483251; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12136s; TotalTimePerSample = 0.48545ms; SamplesPerSecond = 2059
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14684522; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12230s; TotalTimePerSample = 0.48920ms; SamplesPerSecond = 2044
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15322119; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12309s; TotalTimePerSample = 0.49234ms; SamplesPerSecond = 2031
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19882520; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12375s; TotalTimePerSample = 0.49499ms; SamplesPerSecond = 2020
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13683788; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12243s; TotalTimePerSample = 0.48973ms; SamplesPerSecond = 2041
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18621191; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12194s; TotalTimePerSample = 0.48776ms; SamplesPerSecond = 2050
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19408056; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12169s; TotalTimePerSample = 0.48674ms; SamplesPerSecond = 2054
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17298096; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12219s; TotalTimePerSample = 0.48877ms; SamplesPerSecond = 2045
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13265137; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12194s; TotalTimePerSample = 0.48774ms; SamplesPerSecond = 2050
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17627051; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12157s; TotalTimePerSample = 0.48627ms; SamplesPerSecond = 2056
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12734570; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12411s; TotalTimePerSample = 0.49643ms; SamplesPerSecond = 2014
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15108399; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12117s; TotalTimePerSample = 0.48467ms; SamplesPerSecond = 2063
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19729199; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12269s; TotalTimePerSample = 0.49077ms; SamplesPerSecond = 2037
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12857373; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12203s; TotalTimePerSample = 0.48814ms; SamplesPerSecond = 2048
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13867822; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12273s; TotalTimePerSample = 0.49094ms; SamplesPerSecond = 2036
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12786084; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12171s; TotalTimePerSample = 0.48684ms; SamplesPerSecond = 2054
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16643262; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12108s; TotalTimePerSample = 0.48434ms; SamplesPerSecond = 2064
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20440333; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12239s; TotalTimePerSample = 0.48954ms; SamplesPerSecond = 2042
-MPI Rank 1:  Epoch[ 2 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14566259; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12218s; TotalTimePerSample = 0.48873ms; SamplesPerSecond = 2046
-MPI Rank 1: Finished Epoch[ 2 of 10]: [Training Set] TrainLossPerSample = 0.20373113; EvalErrPerSample = 0.082699999; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.929569
-MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.008000  momentum = 0.900001 
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.50774607; EvalErr[0]PerSample = 0.24000000; TotalTime = 0.12601s; TotalTimePerSample = 0.50405ms; SamplesPerSecond = 1983
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.43388910; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12544s; TotalTimePerSample = 0.50174ms; SamplesPerSecond = 1993
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.36674852; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12539s; TotalTimePerSample = 0.50155ms; SamplesPerSecond = 1993
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.33768746; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12522s; TotalTimePerSample = 0.50086ms; SamplesPerSecond = 1996
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.30320932; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12500s; TotalTimePerSample = 0.49999ms; SamplesPerSecond = 2000
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.29576032; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12561s; TotalTimePerSample = 0.50246ms; SamplesPerSecond = 1990
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.24924483; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12491s; TotalTimePerSample = 0.49963ms; SamplesPerSecond = 2001
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.24632409; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12565s; TotalTimePerSample = 0.50259ms; SamplesPerSecond = 1989
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20943152; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12520s; TotalTimePerSample = 0.50078ms; SamplesPerSecond = 1996
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19115992; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12536s; TotalTimePerSample = 0.50143ms; SamplesPerSecond = 1994
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17923227; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12509s; TotalTimePerSample = 0.50036ms; SamplesPerSecond = 1998
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17075420; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12523s; TotalTimePerSample = 0.50092ms; SamplesPerSecond = 1996
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14442369; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12530s; TotalTimePerSample = 0.50118ms; SamplesPerSecond = 1995
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17753818; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12511s; TotalTimePerSample = 0.50044ms; SamplesPerSecond = 1998
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15087853; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12521s; TotalTimePerSample = 0.50086ms; SamplesPerSecond = 1996
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19253021; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12531s; TotalTimePerSample = 0.50123ms; SamplesPerSecond = 1995
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17830684; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12532s; TotalTimePerSample = 0.50130ms; SamplesPerSecond = 1994
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15115428; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12474s; TotalTimePerSample = 0.49895ms; SamplesPerSecond = 2004
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19135968; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12524s; TotalTimePerSample = 0.50095ms; SamplesPerSecond = 1996
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.21491485; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12497s; TotalTimePerSample = 0.49987ms; SamplesPerSecond = 2000
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18682346; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12528s; TotalTimePerSample = 0.50113ms; SamplesPerSecond = 1995
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18483205; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12501s; TotalTimePerSample = 0.50002ms; SamplesPerSecond = 1999
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14684503; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12495s; TotalTimePerSample = 0.49978ms; SamplesPerSecond = 2000
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15322116; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12527s; TotalTimePerSample = 0.50109ms; SamplesPerSecond = 1995
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19882571; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12518s; TotalTimePerSample = 0.50072ms; SamplesPerSecond = 1997
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13683832; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12507s; TotalTimePerSample = 0.50028ms; SamplesPerSecond = 1998
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18621189; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12489s; TotalTimePerSample = 0.49957ms; SamplesPerSecond = 2001
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19408050; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12602s; TotalTimePerSample = 0.50407ms; SamplesPerSecond = 1983
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17298137; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12516s; TotalTimePerSample = 0.50063ms; SamplesPerSecond = 1997
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13265130; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12561s; TotalTimePerSample = 0.50245ms; SamplesPerSecond = 1990
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17627178; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12509s; TotalTimePerSample = 0.50038ms; SamplesPerSecond = 1998
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12734628; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12580s; TotalTimePerSample = 0.50321ms; SamplesPerSecond = 1987
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15108452; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12484s; TotalTimePerSample = 0.49936ms; SamplesPerSecond = 2002
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19729185; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12592s; TotalTimePerSample = 0.50368ms; SamplesPerSecond = 1985
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12857333; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12505s; TotalTimePerSample = 0.50020ms; SamplesPerSecond = 1999
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13867804; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12550s; TotalTimePerSample = 0.50201ms; SamplesPerSecond = 1992
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12786051; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12550s; TotalTimePerSample = 0.50202ms; SamplesPerSecond = 1991
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16643303; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12504s; TotalTimePerSample = 0.50018ms; SamplesPerSecond = 1999
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20440408; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12576s; TotalTimePerSample = 0.50303ms; SamplesPerSecond = 1987
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14566237; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12516s; TotalTimePerSample = 0.50066ms; SamplesPerSecond = 1997
+MPI Rank 1: Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 0.20373026; EvalErrPerSample = 0.0827; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.013461
+MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.008000  effective momentum = 0.900000 
 MPI Rank 1: starting epoch 2 at record count 20000, and file position 0
 MPI Rank 1: already there from last epoch
 MPI Rank 1: 
 MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 4, NumGradientBits = 32).
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[   1-  10 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12590086; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12914s; TotalTimePerSample = 0.51656ms; SamplesPerSecond = 1935
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[  11-  20 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17780226; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12397s; TotalTimePerSample = 0.49587ms; SamplesPerSecond = 2016
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[  21-  30 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14417633; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12282s; TotalTimePerSample = 0.49128ms; SamplesPerSecond = 2035
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[  31-  40 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15796880; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12283s; TotalTimePerSample = 0.49133ms; SamplesPerSecond = 2035
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[  41-  50 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17002991; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12300s; TotalTimePerSample = 0.49200ms; SamplesPerSecond = 2032
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[  51-  60 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18262109; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12231s; TotalTimePerSample = 0.48924ms; SamplesPerSecond = 2044
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[  61-  70 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14643688; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12105s; TotalTimePerSample = 0.48419ms; SamplesPerSecond = 2065
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[  71-  80 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18030518; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12216s; TotalTimePerSample = 0.48862ms; SamplesPerSecond = 2046
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[  81-  90 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15846142; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12217s; TotalTimePerSample = 0.48869ms; SamplesPerSecond = 2046
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[  91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14486536; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12135s; TotalTimePerSample = 0.48540ms; SamplesPerSecond = 2060
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13469091; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12151s; TotalTimePerSample = 0.48603ms; SamplesPerSecond = 2057
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12148s; TotalTimePerSample = 0.48593ms; SamplesPerSecond = 2057
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.11641297; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12268s; TotalTimePerSample = 0.49074ms; SamplesPerSecond = 2037
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16786633; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12218s; TotalTimePerSample = 0.48871ms; SamplesPerSecond = 2046
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12811548; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12211s; TotalTimePerSample = 0.48844ms; SamplesPerSecond = 2047
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17257836; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12252s; TotalTimePerSample = 0.49008ms; SamplesPerSecond = 2040
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17623682; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12011s; TotalTimePerSample = 0.48046ms; SamplesPerSecond = 2081
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14121118; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12138s; TotalTimePerSample = 0.48552ms; SamplesPerSecond = 2059
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19243409; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12070s; TotalTimePerSample = 0.48282ms; SamplesPerSecond = 2071
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20908155; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12151s; TotalTimePerSample = 0.48602ms; SamplesPerSecond = 2057
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18472095; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12151s; TotalTimePerSample = 0.48602ms; SamplesPerSecond = 2057
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18185547; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12187s; TotalTimePerSample = 0.48750ms; SamplesPerSecond = 2051
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14074194; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12175s; TotalTimePerSample = 0.48701ms; SamplesPerSecond = 2053
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14871632; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12169s; TotalTimePerSample = 0.48676ms; SamplesPerSecond = 2054
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20299682; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12217s; TotalTimePerSample = 0.48868ms; SamplesPerSecond = 2046
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12852076; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12056s; TotalTimePerSample = 0.48223ms; SamplesPerSecond = 2073
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18660498; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12199s; TotalTimePerSample = 0.48794ms; SamplesPerSecond = 2049
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19576025; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12241s; TotalTimePerSample = 0.48964ms; SamplesPerSecond = 2042
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16667627; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12267s; TotalTimePerSample = 0.49067ms; SamplesPerSecond = 2038
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12526172; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12060s; TotalTimePerSample = 0.48239ms; SamplesPerSecond = 2073
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17391992; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12147s; TotalTimePerSample = 0.48588ms; SamplesPerSecond = 2058
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12281641; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12143s; TotalTimePerSample = 0.48574ms; SamplesPerSecond = 2058
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14759424; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12211s; TotalTimePerSample = 0.48844ms; SamplesPerSecond = 2047
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19801368; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12211s; TotalTimePerSample = 0.48843ms; SamplesPerSecond = 2047
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12593359; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12245s; TotalTimePerSample = 0.48980ms; SamplesPerSecond = 2041
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13756640; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12191s; TotalTimePerSample = 0.48763ms; SamplesPerSecond = 2050
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12244s; TotalTimePerSample = 0.48977ms; SamplesPerSecond = 2041
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16654395; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12275s; TotalTimePerSample = 0.49099ms; SamplesPerSecond = 2036
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20658936; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12092s; TotalTimePerSample = 0.48370ms; SamplesPerSecond = 2067
-MPI Rank 1:  Epoch[ 3 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14583300; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12209s; TotalTimePerSample = 0.48836ms; SamplesPerSecond = 2047
-MPI Rank 1: Finished Epoch[ 3 of 10]: [Training Set] TrainLossPerSample = 0.15948617; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.919364
-MPI Rank 1: Starting Epoch 4: learning rate per sample = 0.008000  momentum = 0.900001 
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12590085; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12568s; TotalTimePerSample = 0.50274ms; SamplesPerSecond = 1989
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17780230; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12485s; TotalTimePerSample = 0.49939ms; SamplesPerSecond = 2002
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14417637; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12581s; TotalTimePerSample = 0.50323ms; SamplesPerSecond = 1987
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15796896; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12536s; TotalTimePerSample = 0.50145ms; SamplesPerSecond = 1994
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17003000; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12540s; TotalTimePerSample = 0.50160ms; SamplesPerSecond = 1993
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18262114; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12534s; TotalTimePerSample = 0.50136ms; SamplesPerSecond = 1994
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14643695; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12559s; TotalTimePerSample = 0.50237ms; SamplesPerSecond = 1990
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18030528; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12591s; TotalTimePerSample = 0.50362ms; SamplesPerSecond = 1985
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15846150; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12559s; TotalTimePerSample = 0.50235ms; SamplesPerSecond = 1990
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14486534; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12553s; TotalTimePerSample = 0.50213ms; SamplesPerSecond = 1991
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13469094; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12535s; TotalTimePerSample = 0.50142ms; SamplesPerSecond = 1994
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12529s; TotalTimePerSample = 0.50116ms; SamplesPerSecond = 1995
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.11641295; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12531s; TotalTimePerSample = 0.50123ms; SamplesPerSecond = 1995
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16786647; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12510s; TotalTimePerSample = 0.50041ms; SamplesPerSecond = 1998
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12811514; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12515s; TotalTimePerSample = 0.50060ms; SamplesPerSecond = 1997
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17257851; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12592s; TotalTimePerSample = 0.50366ms; SamplesPerSecond = 1985
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17623656; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12521s; TotalTimePerSample = 0.50083ms; SamplesPerSecond = 1996
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14121117; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12537s; TotalTimePerSample = 0.50146ms; SamplesPerSecond = 1994
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19243442; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12551s; TotalTimePerSample = 0.50205ms; SamplesPerSecond = 1991
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20908161; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12523s; TotalTimePerSample = 0.50090ms; SamplesPerSecond = 1996
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18472067; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12551s; TotalTimePerSample = 0.50206ms; SamplesPerSecond = 1991
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18185536; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12499s; TotalTimePerSample = 0.49998ms; SamplesPerSecond = 2000
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14074204; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12514s; TotalTimePerSample = 0.50055ms; SamplesPerSecond = 1997
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14871620; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12501s; TotalTimePerSample = 0.50005ms; SamplesPerSecond = 1999
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20299705; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12576s; TotalTimePerSample = 0.50303ms; SamplesPerSecond = 1987
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12852037; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12521s; TotalTimePerSample = 0.50085ms; SamplesPerSecond = 1996
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18660440; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12537s; TotalTimePerSample = 0.50149ms; SamplesPerSecond = 1994
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19575997; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12482s; TotalTimePerSample = 0.49928ms; SamplesPerSecond = 2002
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16667676; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12597s; TotalTimePerSample = 0.50387ms; SamplesPerSecond = 1984
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12526168; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12510s; TotalTimePerSample = 0.50040ms; SamplesPerSecond = 1998
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17392133; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12561s; TotalTimePerSample = 0.50245ms; SamplesPerSecond = 1990
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12281615; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12508s; TotalTimePerSample = 0.50032ms; SamplesPerSecond = 1998
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14759390; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12501s; TotalTimePerSample = 0.50003ms; SamplesPerSecond = 1999
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19801300; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12533s; TotalTimePerSample = 0.50131ms; SamplesPerSecond = 1994
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12593395; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12505s; TotalTimePerSample = 0.50018ms; SamplesPerSecond = 1999
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13756617; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12535s; TotalTimePerSample = 0.50138ms; SamplesPerSecond = 1994
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12529s; TotalTimePerSample = 0.50116ms; SamplesPerSecond = 1995
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16654368; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12546s; TotalTimePerSample = 0.50186ms; SamplesPerSecond = 1992
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20658950; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12535s; TotalTimePerSample = 0.50140ms; SamplesPerSecond = 1994
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14583322; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12562s; TotalTimePerSample = 0.50246ms; SamplesPerSecond = 1990
+MPI Rank 1: Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 0.15948618; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.016666
+MPI Rank 1: Starting Epoch 4: learning rate per sample = 0.008000  effective momentum = 0.900000 
 MPI Rank 1: starting epoch 3 at record count 30000, and file position 0
 MPI Rank 1: already there from last epoch
 MPI Rank 1: 
 MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 4, NumGradientBits = 32).
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[   1-  10 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12371233; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12759s; TotalTimePerSample = 0.51034ms; SamplesPerSecond = 1959
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[  11-  20 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18070515; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12299s; TotalTimePerSample = 0.49196ms; SamplesPerSecond = 2032
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[  21-  30 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14239721; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12169s; TotalTimePerSample = 0.48675ms; SamplesPerSecond = 2054
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[  31-  40 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15630139; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12204s; TotalTimePerSample = 0.48816ms; SamplesPerSecond = 2048
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[  41-  50 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16935523; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12093s; TotalTimePerSample = 0.48370ms; SamplesPerSecond = 2067
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[  51-  60 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18198816; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12085s; TotalTimePerSample = 0.48342ms; SamplesPerSecond = 2068
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[  61-  70 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14475952; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12254s; TotalTimePerSample = 0.49016ms; SamplesPerSecond = 2040
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[  71-  80 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18021594; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12209s; TotalTimePerSample = 0.48835ms; SamplesPerSecond = 2047
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[  81-  90 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15849304; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12486s; TotalTimePerSample = 0.49944ms; SamplesPerSecond = 2002
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[  91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14474402; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12218s; TotalTimePerSample = 0.48872ms; SamplesPerSecond = 2046
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13362928; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12227s; TotalTimePerSample = 0.48908ms; SamplesPerSecond = 2044
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13708325; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12183s; TotalTimePerSample = 0.48730ms; SamplesPerSecond = 2052
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.11569763; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12245s; TotalTimePerSample = 0.48978ms; SamplesPerSecond = 2041
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16892321; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12149s; TotalTimePerSample = 0.48596ms; SamplesPerSecond = 2057
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12752125; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12130s; TotalTimePerSample = 0.48520ms; SamplesPerSecond = 2061
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17100880; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12211s; TotalTimePerSample = 0.48844ms; SamplesPerSecond = 2047
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17660449; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12219s; TotalTimePerSample = 0.48877ms; SamplesPerSecond = 2045
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14105836; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12170s; TotalTimePerSample = 0.48678ms; SamplesPerSecond = 2054
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19333544; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12125s; TotalTimePerSample = 0.48498ms; SamplesPerSecond = 2061
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20859498; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12070s; TotalTimePerSample = 0.48282ms; SamplesPerSecond = 2071
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18499707; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12381s; TotalTimePerSample = 0.49523ms; SamplesPerSecond = 2019
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18152441; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12165s; TotalTimePerSample = 0.48659ms; SamplesPerSecond = 2055
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14037134; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12260s; TotalTimePerSample = 0.49042ms; SamplesPerSecond = 2039
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14866894; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12392s; TotalTimePerSample = 0.49567ms; SamplesPerSecond = 2017
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20347705; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12232s; TotalTimePerSample = 0.48928ms; SamplesPerSecond = 2043
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12815039; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12246s; TotalTimePerSample = 0.48984ms; SamplesPerSecond = 2041
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18672803; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12290s; TotalTimePerSample = 0.49159ms; SamplesPerSecond = 2034
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19552930; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12108s; TotalTimePerSample = 0.48432ms; SamplesPerSecond = 2064
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16452637; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12196s; TotalTimePerSample = 0.48784ms; SamplesPerSecond = 2049
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12461865; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12028s; TotalTimePerSample = 0.48112ms; SamplesPerSecond = 2078
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17285107; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12185s; TotalTimePerSample = 0.48740ms; SamplesPerSecond = 2051
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12253613; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12355s; TotalTimePerSample = 0.49418ms; SamplesPerSecond = 2023
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14723291; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12238s; TotalTimePerSample = 0.48953ms; SamplesPerSecond = 2042
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19789551; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12335s; TotalTimePerSample = 0.49340ms; SamplesPerSecond = 2026
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12575878; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12337s; TotalTimePerSample = 0.49350ms; SamplesPerSecond = 2026
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13745947; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12227s; TotalTimePerSample = 0.48908ms; SamplesPerSecond = 2044
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12839746; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12155s; TotalTimePerSample = 0.48619ms; SamplesPerSecond = 2056
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16647315; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12210s; TotalTimePerSample = 0.48839ms; SamplesPerSecond = 2047
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20679444; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12144s; TotalTimePerSample = 0.48577ms; SamplesPerSecond = 2058
-MPI Rank 1:  Epoch[ 4 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14585204; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12220s; TotalTimePerSample = 0.48881ms; SamplesPerSecond = 2045
-MPI Rank 1: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15914927; EvalErrPerSample = 0.076700002; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.927435
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12371233; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12584s; TotalTimePerSample = 0.50334ms; SamplesPerSecond = 1986
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18070514; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12537s; TotalTimePerSample = 0.50147ms; SamplesPerSecond = 1994
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14239731; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12565s; TotalTimePerSample = 0.50259ms; SamplesPerSecond = 1989
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15630155; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12544s; TotalTimePerSample = 0.50177ms; SamplesPerSecond = 1992
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16935525; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12539s; TotalTimePerSample = 0.50154ms; SamplesPerSecond = 1993
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18198833; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12549s; TotalTimePerSample = 0.50194ms; SamplesPerSecond = 1992
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14475946; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12513s; TotalTimePerSample = 0.50050ms; SamplesPerSecond = 1997
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18021602; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12488s; TotalTimePerSample = 0.49954ms; SamplesPerSecond = 2001
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15849308; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12525s; TotalTimePerSample = 0.50099ms; SamplesPerSecond = 1996
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14474426; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12565s; TotalTimePerSample = 0.50260ms; SamplesPerSecond = 1989
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13362926; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12456s; TotalTimePerSample = 0.49826ms; SamplesPerSecond = 2007
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13708300; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12520s; TotalTimePerSample = 0.50082ms; SamplesPerSecond = 1996
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.11569776; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12513s; TotalTimePerSample = 0.50052ms; SamplesPerSecond = 1997
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16892330; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12555s; TotalTimePerSample = 0.50222ms; SamplesPerSecond = 1991
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12752163; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12519s; TotalTimePerSample = 0.50076ms; SamplesPerSecond = 1996
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17100866; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12561s; TotalTimePerSample = 0.50244ms; SamplesPerSecond = 1990
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17660425; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12497s; TotalTimePerSample = 0.49989ms; SamplesPerSecond = 2000
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14105804; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12543s; TotalTimePerSample = 0.50174ms; SamplesPerSecond = 1993
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19333553; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12540s; TotalTimePerSample = 0.50159ms; SamplesPerSecond = 1993
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20859525; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12518s; TotalTimePerSample = 0.50071ms; SamplesPerSecond = 1997
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18499677; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12594s; TotalTimePerSample = 0.50376ms; SamplesPerSecond = 1985
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18152438; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12548s; TotalTimePerSample = 0.50193ms; SamplesPerSecond = 1992
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14037158; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12541s; TotalTimePerSample = 0.50164ms; SamplesPerSecond = 1993
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14866862; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12546s; TotalTimePerSample = 0.50184ms; SamplesPerSecond = 1992
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20347748; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12561s; TotalTimePerSample = 0.50244ms; SamplesPerSecond = 1990
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12815013; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12521s; TotalTimePerSample = 0.50085ms; SamplesPerSecond = 1996
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18672810; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12565s; TotalTimePerSample = 0.50258ms; SamplesPerSecond = 1989
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19552989; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12497s; TotalTimePerSample = 0.49988ms; SamplesPerSecond = 2000
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16452642; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12554s; TotalTimePerSample = 0.50214ms; SamplesPerSecond = 1991
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12461825; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12499s; TotalTimePerSample = 0.49995ms; SamplesPerSecond = 2000
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17285251; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12571s; TotalTimePerSample = 0.50283ms; SamplesPerSecond = 1988
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12253620; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12562s; TotalTimePerSample = 0.50250ms; SamplesPerSecond = 1990
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14723333; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12541s; TotalTimePerSample = 0.50164ms; SamplesPerSecond = 1993
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19789537; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12618s; TotalTimePerSample = 0.50470ms; SamplesPerSecond = 1981
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12575877; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12608s; TotalTimePerSample = 0.50433ms; SamplesPerSecond = 1982
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13745928; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12556s; TotalTimePerSample = 0.50225ms; SamplesPerSecond = 1991
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12839652; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12497s; TotalTimePerSample = 0.49987ms; SamplesPerSecond = 2000
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16647280; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12550s; TotalTimePerSample = 0.50201ms; SamplesPerSecond = 1991
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20679434; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12523s; TotalTimePerSample = 0.50091ms; SamplesPerSecond = 1996
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12566s; TotalTimePerSample = 0.50264ms; SamplesPerSecond = 1989
+MPI Rank 1: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.01855
+MPI Rank 1: CNTKCommandTrainEnd: SimpleMultiGPU
 MPI Rank 1: COMPLETED
 MPI Rank 1: ~MPIWrapper
-MPI Rank 2: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank2
-MPI Rank 2: -------------------------------------------------------------------
-MPI Rank 2: Build info: 
-MPI Rank 2: 
-MPI Rank 2: 		Built time: Aug 25 2015 17:44:46
-MPI Rank 2: 		Last modified date: Mon Aug 24 16:38:42 2015
-MPI Rank 2: 		Built by amitaga on Amitaga-Win-DT3           
-MPI Rank 2: 		Build Path: E:\NetScale\CNTK\git_repos\public_master\MachineLearning\CNTK\
-MPI Rank 2: 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
-MPI Rank 2: -------------------------------------------------------------------
-MPI Rank 2: running on Amitaga-Win-DT3 at 2015/08/26 01:48:44
-MPI Rank 2: command line options: 
-MPI Rank 2: configFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\SimpleMultiGPU.config RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] 
+MPI Rank 2: running on localhost at 2015/10/24 12:44:54
+MPI Rank 2: command line: 
+MPI Rank 2: /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../../SimpleMultiGPU.config RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../.. DeviceId=0 precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr 
 MPI Rank 2: 
 MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
 MPI Rank 2: deviceId=$DeviceId$
@@ -1013,12 +1257,11 @@ MPI Rank 2:         minibatchSize=25
 MPI Rank 2:         learningRatesPerMB=0.5:0.2*20:0.1
 MPI Rank 2:         momentumPerMB=0.9
 MPI Rank 2:         dropoutRate=0.0
-MPI Rank 2:         maxEpochs=10
+MPI Rank 2:         maxEpochs=4
 MPI Rank 2:         ParallelTrain=[
 MPI Rank 2:             parallelizationMethod=DataParallelSGD
 MPI Rank 2:             DataParallelSGD=[
 MPI Rank 2:               gradientBits=1
-MPI Rank 2:               parallelizationStartEpoch=1
 MPI Rank 2:             ]
 MPI Rank 2:         ]
 MPI Rank 2:     ]
@@ -1040,12 +1283,13 @@ MPI Rank 2:         labelMappingFile=$DataDir$/SimpleMapping.txt
 MPI Rank 2:       ]
 MPI Rank 2:     ]
 MPI Rank 2: ]
-MPI Rank 2: RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
-MPI Rank 2: DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data
+MPI Rank 2: RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 2: DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data
+MPI Rank 2: ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../..
 MPI Rank 2: DeviceId=0
-MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
 MPI Rank 2: precision=float
 MPI Rank 2: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
+MPI Rank 2: stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr
 MPI Rank 2: 
 MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
 MPI Rank 2: 
@@ -1056,7 +1300,7 @@ MPI Rank 2: precision=float
 MPI Rank 2: parallelTrain=true
 MPI Rank 2: SimpleMultiGPU=[
 MPI Rank 2:     action=train
-MPI Rank 2:     modelPath=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 2:     modelPath=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
 MPI Rank 2:     deviceId=0
 MPI Rank 2:     traceLevel=1
 MPI Rank 2:     SimpleNetworkBuilder=[
@@ -1075,18 +1319,17 @@ MPI Rank 2:         minibatchSize=25
 MPI Rank 2:         learningRatesPerMB=0.5:0.2*20:0.1
 MPI Rank 2:         momentumPerMB=0.9
 MPI Rank 2:         dropoutRate=0.0
-MPI Rank 2:         maxEpochs=10
+MPI Rank 2:         maxEpochs=4
 MPI Rank 2:         ParallelTrain=[
 MPI Rank 2:             parallelizationMethod=DataParallelSGD
 MPI Rank 2:             DataParallelSGD=[
 MPI Rank 2:               gradientBits=1
-MPI Rank 2:               parallelizationStartEpoch=1
 MPI Rank 2:             ]
 MPI Rank 2:         ]
 MPI Rank 2:     ]
 MPI Rank 2:     reader=[
 MPI Rank 2:       readerType=UCIFastReader
-MPI Rank 2:       file=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 2:       file=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt
 MPI Rank 2:       miniBatchMode=Partial
 MPI Rank 2:       randomize=None
 MPI Rank 2:       verbosity=1   
@@ -1098,29 +1341,31 @@ MPI Rank 2:       labels=[
 MPI Rank 2: start=2      
 MPI Rank 2: dim=1        
 MPI Rank 2: labelDim=2   
-MPI Rank 2:         labelMappingFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleMapping.txt
+MPI Rank 2:         labelMappingFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleMapping.txt
 MPI Rank 2:       ]
 MPI Rank 2:     ]
 MPI Rank 2: ]
-MPI Rank 2: RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
-MPI Rank 2: DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data
+MPI Rank 2: RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 2: DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data
+MPI Rank 2: ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../..
 MPI Rank 2: DeviceId=0
-MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
 MPI Rank 2: precision=float
 MPI Rank 2: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
+MPI Rank 2: stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr
 MPI Rank 2: 
 MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 MPI Rank 2: 
 MPI Rank 2: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 MPI Rank 2: configparameters: SimpleMultiGPU.config:command=SimpleMultiGPU
-MPI Rank 2: configparameters: SimpleMultiGPU.config:DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data
+MPI Rank 2: configparameters: SimpleMultiGPU.config:ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../..
+MPI Rank 2: configparameters: SimpleMultiGPU.config:DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data
 MPI Rank 2: configparameters: SimpleMultiGPU.config:deviceId=0
 MPI Rank 2: configparameters: SimpleMultiGPU.config:parallelTrain=true
 MPI Rank 2: configparameters: SimpleMultiGPU.config:precision=float
-MPI Rank 2: configparameters: SimpleMultiGPU.config:RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 2: configparameters: SimpleMultiGPU.config:RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu
 MPI Rank 2: configparameters: SimpleMultiGPU.config:SimpleMultiGPU=[
 MPI Rank 2:     action=train
-MPI Rank 2:     modelPath=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 2:     modelPath=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
 MPI Rank 2:     deviceId=0
 MPI Rank 2:     traceLevel=1
 MPI Rank 2:     SimpleNetworkBuilder=[
@@ -1139,18 +1384,17 @@ MPI Rank 2:         minibatchSize=25
 MPI Rank 2:         learningRatesPerMB=0.5:0.2*20:0.1
 MPI Rank 2:         momentumPerMB=0.9
 MPI Rank 2:         dropoutRate=0.0
-MPI Rank 2:         maxEpochs=10
+MPI Rank 2:         maxEpochs=4
 MPI Rank 2:         ParallelTrain=[
 MPI Rank 2:             parallelizationMethod=DataParallelSGD
 MPI Rank 2:             DataParallelSGD=[
 MPI Rank 2:               gradientBits=1
-MPI Rank 2:               parallelizationStartEpoch=1
 MPI Rank 2:             ]
 MPI Rank 2:         ]
 MPI Rank 2:     ]
 MPI Rank 2:     reader=[
 MPI Rank 2:       readerType=UCIFastReader
-MPI Rank 2:       file=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 2:       file=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt
 MPI Rank 2:       miniBatchMode=Partial
 MPI Rank 2:       randomize=None
 MPI Rank 2:       verbosity=1   
@@ -1162,45 +1406,100 @@ MPI Rank 2:       labels=[
 MPI Rank 2: start=2      
 MPI Rank 2: dim=1        
 MPI Rank 2: labelDim=2   
-MPI Rank 2:         labelMappingFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleMapping.txt
+MPI Rank 2:         labelMappingFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleMapping.txt
 MPI Rank 2:       ]
 MPI Rank 2:     ]
 MPI Rank 2: ] [SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
 MPI Rank 2: 
-MPI Rank 2: configparameters: SimpleMultiGPU.config:stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
+MPI Rank 2: configparameters: SimpleMultiGPU.config:stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr
 MPI Rank 2: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 MPI Rank 2: command: SimpleMultiGPU 
 MPI Rank 2: precision = float
+MPI Rank 2: CNTKModelPath: /tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 2: CNTKCommandTrainInfo: SimpleMultiGPU : 4
+MPI Rank 2: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 4
+MPI Rank 2: CNTKCommandTrainBegin: SimpleMultiGPU
 MPI Rank 2: SimpleNetworkBuilder Using GPU 0
-MPI Rank 2: reading uci file E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 2: reading uci file /home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt
+MPI Rank 2: SetUniformRandomValue (GPU): creating curand object with seed 1
 MPI Rank 2: GetTrainCriterionNodes  ...
 MPI Rank 2: GetEvalCriterionNodes  ...
 MPI Rank 2: 
 MPI Rank 2: 
-MPI Rank 2: Validating node CrossEntropyWithSoftmax 
+MPI Rank 2: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1.
 MPI Rank 2: 
-MPI Rank 2: Validating --> labels = InputValue
-MPI Rank 2: Validating --> W2 = LearnableParameter
-MPI Rank 2: Validating --> W1 = LearnableParameter
-MPI Rank 2: Validating --> W0 = LearnableParameter
-MPI Rank 2: Validating --> features = InputValue
-MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, 3])
-MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, 3])
-MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1])
-MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, 3])
-MPI Rank 2: Validating --> B0 = LearnableParameter
-MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, 3], B0[50, 1])
-MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, 3])
-MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, 3])
-MPI Rank 2: Validating --> B1 = LearnableParameter
-MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, 3], B1[50, 1])
-MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, 3])
-MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, 3])
-MPI Rank 2: Validating --> B2 = LearnableParameter
-MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, 3], B2[2, 1])
-MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, 3], HLast[2, 3])
+MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 2: 
+MPI Rank 2: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 2: 
+MPI Rank 2: Validating for node CrossEntropyWithSoftmax, final verification.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 2: 
+MPI Rank 2: 9 out of 20 nodes do not share the minibatch layout with the input data.
+MPI Rank 2: 
+MPI Rank 2: 
+MPI Rank 2: Precomputing --> 3 PreCompute nodes found.
 MPI Rank 2: 
-MPI Rank 2: Found 3 PreCompute nodes
 MPI Rank 2: 	NodeName: InvStdOfFeatures
 MPI Rank 2: 	NodeName: MeanOfFeatures
 MPI Rank 2: 	NodeName: Prior
@@ -1211,250 +1510,320 @@ MPI Rank 2: starting epoch 0 at record count 0, and file position 0
 MPI Rank 2: already there from last epoch
 MPI Rank 2: 
 MPI Rank 2: 
-MPI Rank 2: Validating node InvStdOfFeatures 
+MPI Rank 2: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1.
 MPI Rank 2: 
-MPI Rank 2: Validating --> features = InputValue
-MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, 25])
+MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: 
+MPI Rank 2: Validating for node InvStdOfFeatures, final verification.
+MPI Rank 2: 
+MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: 
+MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data.
 MPI Rank 2: 
 MPI Rank 2: 
 MPI Rank 2: 
-MPI Rank 2: Validating node MeanOfFeatures 
+MPI Rank 2: Validating for node MeanOfFeatures. 2 nodes to process in pass 1.
 MPI Rank 2: 
-MPI Rank 2: Validating --> features = InputValue
-MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, 25])
+MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: 
+MPI Rank 2: Validating for node MeanOfFeatures, final verification.
+MPI Rank 2: 
+MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: 
+MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data.
 MPI Rank 2: 
 MPI Rank 2: 
 MPI Rank 2: 
-MPI Rank 2: Validating node Prior 
+MPI Rank 2: Validating for node Prior. 2 nodes to process in pass 1.
 MPI Rank 2: 
-MPI Rank 2: Validating --> labels = InputValue
-MPI Rank 2: Validating --> Prior = Mean(labels[2, 25])
+MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: 
+MPI Rank 2: Validating for node Prior. 1 nodes to process in pass 2.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: 
+MPI Rank 2: Validating for node Prior, final verification.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: 
+MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data.
+MPI Rank 2: 
+MPI Rank 2: EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once.
+MPI Rank 2: 
+MPI Rank 2: Precomputing --> Completed.
 MPI Rank 2: 
 MPI Rank 2: Set Max Temp Mem Size For Convolution Nodes to 0 samples.
-MPI Rank 2: Starting Epoch 1: learning rate per sample = 0.020000  momentum = 0.900001 
+MPI Rank 2: Starting Epoch 1: learning rate per sample = 0.020000  effective momentum = 0.900000 
 MPI Rank 2: starting epoch 0 at record count 0, and file position 0
 MPI Rank 2: already there from last epoch
 MPI Rank 2: 
+MPI Rank 2: 
+MPI Rank 2: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 2: 
+MPI Rank 2: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 2: 
+MPI Rank 2: Validating for node EvalErrorPrediction, final verification.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 2: 
+MPI Rank 2: 9 out of 20 nodes do not share the minibatch layout with the input data.
+MPI Rank 2: 
+MPI Rank 2: 
 MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 4, NumGradientBits = 32).
-MPI Rank 2: 
-MPI Rank 2: 
-MPI Rank 2: Validating node EvalErrorPrediction 
-MPI Rank 2: 
-MPI Rank 2: Validating --> labels = InputValue
-MPI Rank 2: Validating --> W2 = LearnableParameter
-MPI Rank 2: Validating --> W1 = LearnableParameter
-MPI Rank 2: Validating --> W0 = LearnableParameter
-MPI Rank 2: Validating --> features = InputValue
-MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, 6])
-MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, 6])
-MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, 6], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1])
-MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, 6])
-MPI Rank 2: Validating --> B0 = LearnableParameter
-MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, 6], B0[50, 1])
-MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, 6])
-MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, 6])
-MPI Rank 2: Validating --> B1 = LearnableParameter
-MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, 6], B1[50, 1])
-MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, 6])
-MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, 6])
-MPI Rank 2: Validating --> B2 = LearnableParameter
-MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, 6], B2[2, 1])
-MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, 6], HLast[2, 6])
-MPI Rank 2: 
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[   1-  10 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70007980; EvalErr[0]PerSample = 0.52399999; TotalTime = 0.19882s; TotalTimePerSample = 0.79529ms; SamplesPerSecond = 1257
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[  11-  20 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71514523; EvalErr[0]PerSample = 0.51999998; TotalTime = 0.15550s; TotalTimePerSample = 0.62202ms; SamplesPerSecond = 1607
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[  21-  30 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.72945595; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.14886s; TotalTimePerSample = 0.59545ms; SamplesPerSecond = 1679
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[  31-  40 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70079005; EvalErr[0]PerSample = 0.52399999; TotalTime = 0.14476s; TotalTimePerSample = 0.57903ms; SamplesPerSecond = 1727
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[  41-  50 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70605618; EvalErr[0]PerSample = 0.54000002; TotalTime = 0.14226s; TotalTimePerSample = 0.56903ms; SamplesPerSecond = 1757
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[  51-  60 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71572435; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.13675s; TotalTimePerSample = 0.54700ms; SamplesPerSecond = 1828
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[  61-  70 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.72149903; EvalErr[0]PerSample = 0.47999999; TotalTime = 0.13630s; TotalTimePerSample = 0.54521ms; SamplesPerSecond = 1834
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[  71-  80 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.79845655; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.13448s; TotalTimePerSample = 0.53790ms; SamplesPerSecond = 1859
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[  81-  90 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69665188; EvalErr[0]PerSample = 0.46799999; TotalTime = 0.13043s; TotalTimePerSample = 0.52172ms; SamplesPerSecond = 1916
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[  91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70723337; EvalErr[0]PerSample = 0.49200001; TotalTime = 0.12787s; TotalTimePerSample = 0.51147ms; SamplesPerSecond = 1955
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71420360; EvalErr[0]PerSample = 0.55199999; TotalTime = 0.12629s; TotalTimePerSample = 0.50515ms; SamplesPerSecond = 1979
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69535255; EvalErr[0]PerSample = 0.43599999; TotalTime = 0.12559s; TotalTimePerSample = 0.50234ms; SamplesPerSecond = 1990
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70078611; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12260s; TotalTimePerSample = 0.49040ms; SamplesPerSecond = 2039
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71857810; EvalErr[0]PerSample = 0.54799998; TotalTime = 0.12291s; TotalTimePerSample = 0.49165ms; SamplesPerSecond = 2033
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.72088283; EvalErr[0]PerSample = 0.48800001; TotalTime = 0.12232s; TotalTimePerSample = 0.48928ms; SamplesPerSecond = 2043
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71798825; EvalErr[0]PerSample = 0.55199999; TotalTime = 0.12123s; TotalTimePerSample = 0.48490ms; SamplesPerSecond = 2062
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.74162209; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12328s; TotalTimePerSample = 0.49314ms; SamplesPerSecond = 2027
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71835059; EvalErr[0]PerSample = 0.51599997; TotalTime = 0.12340s; TotalTimePerSample = 0.49358ms; SamplesPerSecond = 2025
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71529394; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12333s; TotalTimePerSample = 0.49333ms; SamplesPerSecond = 2027
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71727639; EvalErr[0]PerSample = 0.53200001; TotalTime = 0.12471s; TotalTimePerSample = 0.49885ms; SamplesPerSecond = 2004
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71745312; EvalErr[0]PerSample = 0.50400001; TotalTime = 0.12360s; TotalTimePerSample = 0.49439ms; SamplesPerSecond = 2022
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.72088087; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12196s; TotalTimePerSample = 0.48784ms; SamplesPerSecond = 2049
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.72006541; EvalErr[0]PerSample = 0.50800002; TotalTime = 0.12265s; TotalTimePerSample = 0.49062ms; SamplesPerSecond = 2038
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71275192; EvalErr[0]PerSample = 0.51200002; TotalTime = 0.12161s; TotalTimePerSample = 0.48644ms; SamplesPerSecond = 2055
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69645119; EvalErr[0]PerSample = 0.50400001; TotalTime = 0.12121s; TotalTimePerSample = 0.48486ms; SamplesPerSecond = 2062
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70129883; EvalErr[0]PerSample = 0.51200002; TotalTime = 0.12089s; TotalTimePerSample = 0.48356ms; SamplesPerSecond = 2067
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70768166; EvalErr[0]PerSample = 0.54400003; TotalTime = 0.12175s; TotalTimePerSample = 0.48699ms; SamplesPerSecond = 2053
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69744140; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.12268s; TotalTimePerSample = 0.49072ms; SamplesPerSecond = 2037
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69266015; EvalErr[0]PerSample = 0.44800001; TotalTime = 0.12179s; TotalTimePerSample = 0.48714ms; SamplesPerSecond = 2052
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69347072; EvalErr[0]PerSample = 0.49599999; TotalTime = 0.12233s; TotalTimePerSample = 0.48930ms; SamplesPerSecond = 2043
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69257420; EvalErr[0]PerSample = 0.54000002; TotalTime = 0.12287s; TotalTimePerSample = 0.49146ms; SamplesPerSecond = 2034
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.68625975; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12417s; TotalTimePerSample = 0.49666ms; SamplesPerSecond = 2013
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69064063; EvalErr[0]PerSample = 0.46799999; TotalTime = 0.12339s; TotalTimePerSample = 0.49356ms; SamplesPerSecond = 2026
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70192385; EvalErr[0]PerSample = 0.46000001; TotalTime = 0.12176s; TotalTimePerSample = 0.48705ms; SamplesPerSecond = 2053
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69058985; EvalErr[0]PerSample = 0.51999998; TotalTime = 0.12238s; TotalTimePerSample = 0.48951ms; SamplesPerSecond = 2042
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.67041212; EvalErr[0]PerSample = 0.39199999; TotalTime = 0.12185s; TotalTimePerSample = 0.48741ms; SamplesPerSecond = 2051
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.65914255; EvalErr[0]PerSample = 0.35600001; TotalTime = 0.12262s; TotalTimePerSample = 0.49048ms; SamplesPerSecond = 2038
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.63919920; EvalErr[0]PerSample = 0.36399999; TotalTime = 0.12265s; TotalTimePerSample = 0.49060ms; SamplesPerSecond = 2038
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.61294138; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.12142s; TotalTimePerSample = 0.48570ms; SamplesPerSecond = 2058
-MPI Rank 2:  Epoch[ 1 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.55255663; EvalErr[0]PerSample = 0.18799999; TotalTime = 0.12034s; TotalTimePerSample = 0.48138ms; SamplesPerSecond = 2077
-MPI Rank 2: Finished Epoch[ 1 of 10]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.47350001; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.253381
-MPI Rank 2: Starting Epoch 2: learning rate per sample = 0.008000  momentum = 0.900001 
+MPI Rank 2: DecimateMinibatchSequences: WARNING: Number of parallel utterances 25 not a multiple of number of GPUs 4, GPU usage will be suboptimal.
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[   1-  10 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70007977; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.12806s; TotalTimePerSample = 0.51223ms; SamplesPerSecond = 1952
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[  11-  20 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71514542; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12488s; TotalTimePerSample = 0.49952ms; SamplesPerSecond = 2001
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[  21-  30 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72945595; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12485s; TotalTimePerSample = 0.49942ms; SamplesPerSecond = 2002
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[  31-  40 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70079058; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.12472s; TotalTimePerSample = 0.49886ms; SamplesPerSecond = 2004
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[  41-  50 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70605616; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12438s; TotalTimePerSample = 0.49751ms; SamplesPerSecond = 2010
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[  51-  60 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71572398; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12463s; TotalTimePerSample = 0.49853ms; SamplesPerSecond = 2005
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[  61-  70 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72149851; EvalErr[0]PerSample = 0.48000000; TotalTime = 0.12459s; TotalTimePerSample = 0.49836ms; SamplesPerSecond = 2006
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[  71-  80 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.79845605; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12428s; TotalTimePerSample = 0.49713ms; SamplesPerSecond = 2011
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[  81-  90 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69665186; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12443s; TotalTimePerSample = 0.49771ms; SamplesPerSecond = 2009
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[  91- 100 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70723326; EvalErr[0]PerSample = 0.49200000; TotalTime = 0.12508s; TotalTimePerSample = 0.50032ms; SamplesPerSecond = 1998
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 101- 110 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71420344; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.12487s; TotalTimePerSample = 0.49946ms; SamplesPerSecond = 2002
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 111- 120 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69535258; EvalErr[0]PerSample = 0.43600000; TotalTime = 0.12511s; TotalTimePerSample = 0.50045ms; SamplesPerSecond = 1998
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 121- 130 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70078531; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12505s; TotalTimePerSample = 0.50019ms; SamplesPerSecond = 1999
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 131- 140 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71857915; EvalErr[0]PerSample = 0.54800000; TotalTime = 0.12473s; TotalTimePerSample = 0.49891ms; SamplesPerSecond = 2004
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 141- 150 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72088358; EvalErr[0]PerSample = 0.48800000; TotalTime = 0.12486s; TotalTimePerSample = 0.49944ms; SamplesPerSecond = 2002
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 151- 160 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71798839; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.12506s; TotalTimePerSample = 0.50025ms; SamplesPerSecond = 1999
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 161- 170 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.74162164; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12442s; TotalTimePerSample = 0.49768ms; SamplesPerSecond = 2009
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 171- 180 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71835127; EvalErr[0]PerSample = 0.51600000; TotalTime = 0.12471s; TotalTimePerSample = 0.49883ms; SamplesPerSecond = 2004
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 181- 190 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71529462; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12444s; TotalTimePerSample = 0.49777ms; SamplesPerSecond = 2008
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 191- 200 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71727656; EvalErr[0]PerSample = 0.53200000; TotalTime = 0.12439s; TotalTimePerSample = 0.49756ms; SamplesPerSecond = 2009
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 201- 210 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71745517; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.12478s; TotalTimePerSample = 0.49913ms; SamplesPerSecond = 2003
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 211- 220 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72088397; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12447s; TotalTimePerSample = 0.49788ms; SamplesPerSecond = 2008
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 221- 230 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72006808; EvalErr[0]PerSample = 0.50800000; TotalTime = 0.12491s; TotalTimePerSample = 0.49964ms; SamplesPerSecond = 2001
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 231- 240 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71275468; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12509s; TotalTimePerSample = 0.50036ms; SamplesPerSecond = 1998
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 241- 250 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69644781; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.12463s; TotalTimePerSample = 0.49852ms; SamplesPerSecond = 2005
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 251- 260 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70129697; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12496s; TotalTimePerSample = 0.49985ms; SamplesPerSecond = 2000
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 261- 270 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70768095; EvalErr[0]PerSample = 0.54400000; TotalTime = 0.12507s; TotalTimePerSample = 0.50029ms; SamplesPerSecond = 1998
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 271- 280 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69744379; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.12537s; TotalTimePerSample = 0.50148ms; SamplesPerSecond = 1994
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 281- 290 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69266186; EvalErr[0]PerSample = 0.44800000; TotalTime = 0.12541s; TotalTimePerSample = 0.50163ms; SamplesPerSecond = 1993
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 291- 300 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69347266; EvalErr[0]PerSample = 0.49600000; TotalTime = 0.12475s; TotalTimePerSample = 0.49900ms; SamplesPerSecond = 2004
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 301- 310 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69257410; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12566s; TotalTimePerSample = 0.50262ms; SamplesPerSecond = 1989
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 311- 320 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.68625741; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12517s; TotalTimePerSample = 0.50068ms; SamplesPerSecond = 1997
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 321- 330 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69064011; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12519s; TotalTimePerSample = 0.50077ms; SamplesPerSecond = 1996
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 331- 340 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70192153; EvalErr[0]PerSample = 0.46000000; TotalTime = 0.12534s; TotalTimePerSample = 0.50135ms; SamplesPerSecond = 1994
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 341- 350 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69058912; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12528s; TotalTimePerSample = 0.50111ms; SamplesPerSecond = 1995
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 351- 360 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.67041489; EvalErr[0]PerSample = 0.39200000; TotalTime = 0.12603s; TotalTimePerSample = 0.50413ms; SamplesPerSecond = 1983
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 361- 370 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.65913971; EvalErr[0]PerSample = 0.35600000; TotalTime = 0.12535s; TotalTimePerSample = 0.50140ms; SamplesPerSecond = 1994
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 371- 380 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.63919874; EvalErr[0]PerSample = 0.36400000; TotalTime = 0.12492s; TotalTimePerSample = 0.49966ms; SamplesPerSecond = 2001
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 381- 390 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.61293878; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.12545s; TotalTimePerSample = 0.50180ms; SamplesPerSecond = 1992
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 391- 400 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.55255340; EvalErr[0]PerSample = 0.18800000; TotalTime = 0.12484s; TotalTimePerSample = 0.49934ms; SamplesPerSecond = 2002
+MPI Rank 2: Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.4735; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.008988
+MPI Rank 2: Starting Epoch 2: learning rate per sample = 0.008000  effective momentum = 0.900000 
 MPI Rank 2: starting epoch 1 at record count 10000, and file position 0
 MPI Rank 2: already there from last epoch
 MPI Rank 2: 
 MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 4, NumGradientBits = 32).
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[   1-  10 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.50775200; EvalErr[0]PerSample = 0.23999999; TotalTime = 0.12817s; TotalTimePerSample = 0.51266ms; SamplesPerSecond = 1950
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[  11-  20 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.43389454; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12188s; TotalTimePerSample = 0.48752ms; SamplesPerSecond = 2051
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[  21-  30 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.36675408; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12076s; TotalTimePerSample = 0.48305ms; SamplesPerSecond = 2070
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[  31-  40 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.33769274; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12255s; TotalTimePerSample = 0.49018ms; SamplesPerSecond = 2040
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[  41-  50 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.30321363; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12132s; TotalTimePerSample = 0.48528ms; SamplesPerSecond = 2060
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[  51-  60 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.29576379; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12054s; TotalTimePerSample = 0.48217ms; SamplesPerSecond = 2073
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[  61-  70 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.24924731; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12247s; TotalTimePerSample = 0.48987ms; SamplesPerSecond = 2041
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[  71-  80 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.24632569; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12329s; TotalTimePerSample = 0.49317ms; SamplesPerSecond = 2027
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[  81-  90 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20943311; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12288s; TotalTimePerSample = 0.49151ms; SamplesPerSecond = 2034
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[  91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19116065; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12191s; TotalTimePerSample = 0.48763ms; SamplesPerSecond = 2050
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17923315; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12243s; TotalTimePerSample = 0.48971ms; SamplesPerSecond = 2042
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17075513; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12159s; TotalTimePerSample = 0.48638ms; SamplesPerSecond = 2056
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14442432; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12214s; TotalTimePerSample = 0.48854ms; SamplesPerSecond = 2046
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17753857; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12283s; TotalTimePerSample = 0.49132ms; SamplesPerSecond = 2035
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15087914; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12183s; TotalTimePerSample = 0.48733ms; SamplesPerSecond = 2052
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19252978; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12265s; TotalTimePerSample = 0.49059ms; SamplesPerSecond = 2038
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17830664; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12266s; TotalTimePerSample = 0.49066ms; SamplesPerSecond = 2038
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15115429; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12254s; TotalTimePerSample = 0.49015ms; SamplesPerSecond = 2040
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19135889; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12209s; TotalTimePerSample = 0.48838ms; SamplesPerSecond = 2047
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.21491407; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12304s; TotalTimePerSample = 0.49215ms; SamplesPerSecond = 2031
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18682373; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12157s; TotalTimePerSample = 0.48626ms; SamplesPerSecond = 2056
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18483251; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12136s; TotalTimePerSample = 0.48544ms; SamplesPerSecond = 2059
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14684522; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12230s; TotalTimePerSample = 0.48920ms; SamplesPerSecond = 2044
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15322119; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12308s; TotalTimePerSample = 0.49232ms; SamplesPerSecond = 2031
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19882520; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12375s; TotalTimePerSample = 0.49501ms; SamplesPerSecond = 2020
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13683788; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12243s; TotalTimePerSample = 0.48974ms; SamplesPerSecond = 2041
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18621191; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12193s; TotalTimePerSample = 0.48772ms; SamplesPerSecond = 2050
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19408056; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12169s; TotalTimePerSample = 0.48674ms; SamplesPerSecond = 2054
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17298096; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12218s; TotalTimePerSample = 0.48873ms; SamplesPerSecond = 2046
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13265137; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12193s; TotalTimePerSample = 0.48772ms; SamplesPerSecond = 2050
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17627051; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12157s; TotalTimePerSample = 0.48628ms; SamplesPerSecond = 2056
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12734570; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12410s; TotalTimePerSample = 0.49638ms; SamplesPerSecond = 2014
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15108399; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12118s; TotalTimePerSample = 0.48473ms; SamplesPerSecond = 2063
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19729199; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12267s; TotalTimePerSample = 0.49067ms; SamplesPerSecond = 2038
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12857373; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12203s; TotalTimePerSample = 0.48812ms; SamplesPerSecond = 2048
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13867822; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12274s; TotalTimePerSample = 0.49095ms; SamplesPerSecond = 2036
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12786084; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12170s; TotalTimePerSample = 0.48682ms; SamplesPerSecond = 2054
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16643262; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12108s; TotalTimePerSample = 0.48432ms; SamplesPerSecond = 2064
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20440333; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12238s; TotalTimePerSample = 0.48951ms; SamplesPerSecond = 2042
-MPI Rank 2:  Epoch[ 2 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14566259; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12230s; TotalTimePerSample = 0.48920ms; SamplesPerSecond = 2044
-MPI Rank 2: Finished Epoch[ 2 of 10]: [Training Set] TrainLossPerSample = 0.20373113; EvalErrPerSample = 0.082699999; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.929244
-MPI Rank 2: Starting Epoch 3: learning rate per sample = 0.008000  momentum = 0.900001 
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.50774607; EvalErr[0]PerSample = 0.24000000; TotalTime = 0.12606s; TotalTimePerSample = 0.50425ms; SamplesPerSecond = 1983
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.43388910; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12509s; TotalTimePerSample = 0.50036ms; SamplesPerSecond = 1998
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.36674852; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12573s; TotalTimePerSample = 0.50294ms; SamplesPerSecond = 1988
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.33768746; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12522s; TotalTimePerSample = 0.50089ms; SamplesPerSecond = 1996
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.30320932; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12500s; TotalTimePerSample = 0.50001ms; SamplesPerSecond = 1999
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.29576032; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12562s; TotalTimePerSample = 0.50246ms; SamplesPerSecond = 1990
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.24924483; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12481s; TotalTimePerSample = 0.49925ms; SamplesPerSecond = 2003
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.24632409; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12575s; TotalTimePerSample = 0.50299ms; SamplesPerSecond = 1988
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20943152; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12520s; TotalTimePerSample = 0.50078ms; SamplesPerSecond = 1996
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19115992; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12536s; TotalTimePerSample = 0.50144ms; SamplesPerSecond = 1994
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17923227; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12509s; TotalTimePerSample = 0.50036ms; SamplesPerSecond = 1998
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17075420; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12523s; TotalTimePerSample = 0.50093ms; SamplesPerSecond = 1996
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14442369; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12530s; TotalTimePerSample = 0.50120ms; SamplesPerSecond = 1995
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17753818; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12512s; TotalTimePerSample = 0.50048ms; SamplesPerSecond = 1998
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15087853; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12521s; TotalTimePerSample = 0.50084ms; SamplesPerSecond = 1996
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19253021; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12531s; TotalTimePerSample = 0.50125ms; SamplesPerSecond = 1995
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17830684; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12510s; TotalTimePerSample = 0.50039ms; SamplesPerSecond = 1998
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15115428; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12497s; TotalTimePerSample = 0.49987ms; SamplesPerSecond = 2000
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19135968; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12524s; TotalTimePerSample = 0.50096ms; SamplesPerSecond = 1996
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.21491485; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12497s; TotalTimePerSample = 0.49989ms; SamplesPerSecond = 2000
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18682346; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12528s; TotalTimePerSample = 0.50113ms; SamplesPerSecond = 1995
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18483205; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12500s; TotalTimePerSample = 0.49998ms; SamplesPerSecond = 2000
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14684503; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12496s; TotalTimePerSample = 0.49986ms; SamplesPerSecond = 2000
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15322116; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12528s; TotalTimePerSample = 0.50110ms; SamplesPerSecond = 1995
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19882571; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12508s; TotalTimePerSample = 0.50033ms; SamplesPerSecond = 1998
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13683832; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12516s; TotalTimePerSample = 0.50066ms; SamplesPerSecond = 1997
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18621189; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12489s; TotalTimePerSample = 0.49958ms; SamplesPerSecond = 2001
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19408050; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12602s; TotalTimePerSample = 0.50408ms; SamplesPerSecond = 1983
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17298137; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12516s; TotalTimePerSample = 0.50064ms; SamplesPerSecond = 1997
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13265130; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12561s; TotalTimePerSample = 0.50246ms; SamplesPerSecond = 1990
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17627178; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12509s; TotalTimePerSample = 0.50036ms; SamplesPerSecond = 1998
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12734628; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12564s; TotalTimePerSample = 0.50257ms; SamplesPerSecond = 1989
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15108452; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12521s; TotalTimePerSample = 0.50085ms; SamplesPerSecond = 1996
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19729185; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12571s; TotalTimePerSample = 0.50283ms; SamplesPerSecond = 1988
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12857333; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12505s; TotalTimePerSample = 0.50021ms; SamplesPerSecond = 1999
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13867804; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12550s; TotalTimePerSample = 0.50201ms; SamplesPerSecond = 1991
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12786051; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12551s; TotalTimePerSample = 0.50204ms; SamplesPerSecond = 1991
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16643303; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12505s; TotalTimePerSample = 0.50018ms; SamplesPerSecond = 1999
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20440408; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12575s; TotalTimePerSample = 0.50302ms; SamplesPerSecond = 1988
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14566237; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12517s; TotalTimePerSample = 0.50067ms; SamplesPerSecond = 1997
+MPI Rank 2: Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 0.20373026; EvalErrPerSample = 0.0827; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.013494
+MPI Rank 2: Starting Epoch 3: learning rate per sample = 0.008000  effective momentum = 0.900000 
 MPI Rank 2: starting epoch 2 at record count 20000, and file position 0
 MPI Rank 2: already there from last epoch
 MPI Rank 2: 
 MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 4, NumGradientBits = 32).
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[   1-  10 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12590086; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12913s; TotalTimePerSample = 0.51653ms; SamplesPerSecond = 1935
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[  11-  20 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17780226; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12397s; TotalTimePerSample = 0.49587ms; SamplesPerSecond = 2016
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[  21-  30 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14417633; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12281s; TotalTimePerSample = 0.49122ms; SamplesPerSecond = 2035
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[  31-  40 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15796880; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12283s; TotalTimePerSample = 0.49132ms; SamplesPerSecond = 2035
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[  41-  50 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17002991; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12299s; TotalTimePerSample = 0.49197ms; SamplesPerSecond = 2032
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[  51-  60 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18262109; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12231s; TotalTimePerSample = 0.48926ms; SamplesPerSecond = 2043
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[  61-  70 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14643688; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12104s; TotalTimePerSample = 0.48418ms; SamplesPerSecond = 2065
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[  71-  80 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18030518; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12215s; TotalTimePerSample = 0.48860ms; SamplesPerSecond = 2046
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[  81-  90 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15846142; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12214s; TotalTimePerSample = 0.48858ms; SamplesPerSecond = 2046
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[  91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14486536; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12133s; TotalTimePerSample = 0.48530ms; SamplesPerSecond = 2060
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13469091; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12151s; TotalTimePerSample = 0.48602ms; SamplesPerSecond = 2057
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12147s; TotalTimePerSample = 0.48589ms; SamplesPerSecond = 2058
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.11641297; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12268s; TotalTimePerSample = 0.49072ms; SamplesPerSecond = 2037
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16786633; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12218s; TotalTimePerSample = 0.48872ms; SamplesPerSecond = 2046
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12811548; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12211s; TotalTimePerSample = 0.48843ms; SamplesPerSecond = 2047
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17257836; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12252s; TotalTimePerSample = 0.49008ms; SamplesPerSecond = 2040
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17623682; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12011s; TotalTimePerSample = 0.48045ms; SamplesPerSecond = 2081
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14121118; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12143s; TotalTimePerSample = 0.48572ms; SamplesPerSecond = 2058
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19243409; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12070s; TotalTimePerSample = 0.48280ms; SamplesPerSecond = 2071
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20908155; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12150s; TotalTimePerSample = 0.48598ms; SamplesPerSecond = 2057
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18472095; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12149s; TotalTimePerSample = 0.48598ms; SamplesPerSecond = 2057
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18185547; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12188s; TotalTimePerSample = 0.48752ms; SamplesPerSecond = 2051
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14074194; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12174s; TotalTimePerSample = 0.48697ms; SamplesPerSecond = 2053
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14871632; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12189s; TotalTimePerSample = 0.48758ms; SamplesPerSecond = 2050
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20299682; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12199s; TotalTimePerSample = 0.48796ms; SamplesPerSecond = 2049
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12852076; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12055s; TotalTimePerSample = 0.48218ms; SamplesPerSecond = 2073
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18660498; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12198s; TotalTimePerSample = 0.48793ms; SamplesPerSecond = 2049
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19576025; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12240s; TotalTimePerSample = 0.48961ms; SamplesPerSecond = 2042
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16667627; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12266s; TotalTimePerSample = 0.49062ms; SamplesPerSecond = 2038
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12526172; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12059s; TotalTimePerSample = 0.48236ms; SamplesPerSecond = 2073
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17391992; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12146s; TotalTimePerSample = 0.48585ms; SamplesPerSecond = 2058
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12281641; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12142s; TotalTimePerSample = 0.48568ms; SamplesPerSecond = 2058
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14759424; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12211s; TotalTimePerSample = 0.48842ms; SamplesPerSecond = 2047
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19801368; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12210s; TotalTimePerSample = 0.48840ms; SamplesPerSecond = 2047
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12593359; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12244s; TotalTimePerSample = 0.48974ms; SamplesPerSecond = 2041
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13756640; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12191s; TotalTimePerSample = 0.48764ms; SamplesPerSecond = 2050
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12244s; TotalTimePerSample = 0.48974ms; SamplesPerSecond = 2041
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16654395; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12275s; TotalTimePerSample = 0.49099ms; SamplesPerSecond = 2036
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20658936; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12092s; TotalTimePerSample = 0.48367ms; SamplesPerSecond = 2067
-MPI Rank 2:  Epoch[ 3 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14583300; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12208s; TotalTimePerSample = 0.48833ms; SamplesPerSecond = 2047
-MPI Rank 2: Finished Epoch[ 3 of 10]: [Training Set] TrainLossPerSample = 0.15948617; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.919216
-MPI Rank 2: Starting Epoch 4: learning rate per sample = 0.008000  momentum = 0.900001 
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12590085; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12574s; TotalTimePerSample = 0.50295ms; SamplesPerSecond = 1988
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17780230; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12505s; TotalTimePerSample = 0.50020ms; SamplesPerSecond = 1999
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14417637; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12561s; TotalTimePerSample = 0.50243ms; SamplesPerSecond = 1990
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15796896; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12537s; TotalTimePerSample = 0.50146ms; SamplesPerSecond = 1994
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17003000; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12540s; TotalTimePerSample = 0.50161ms; SamplesPerSecond = 1993
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18262114; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12511s; TotalTimePerSample = 0.50045ms; SamplesPerSecond = 1998
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14643695; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12582s; TotalTimePerSample = 0.50330ms; SamplesPerSecond = 1986
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18030528; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12591s; TotalTimePerSample = 0.50363ms; SamplesPerSecond = 1985
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15846150; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12559s; TotalTimePerSample = 0.50235ms; SamplesPerSecond = 1990
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14486534; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12554s; TotalTimePerSample = 0.50214ms; SamplesPerSecond = 1991
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13469094; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12556s; TotalTimePerSample = 0.50224ms; SamplesPerSecond = 1991
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12509s; TotalTimePerSample = 0.50035ms; SamplesPerSecond = 1998
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.11641295; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12531s; TotalTimePerSample = 0.50124ms; SamplesPerSecond = 1995
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16786647; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12510s; TotalTimePerSample = 0.50040ms; SamplesPerSecond = 1998
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12811514; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12516s; TotalTimePerSample = 0.50063ms; SamplesPerSecond = 1997
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17257851; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12592s; TotalTimePerSample = 0.50368ms; SamplesPerSecond = 1985
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17623656; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12542s; TotalTimePerSample = 0.50166ms; SamplesPerSecond = 1993
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14121117; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12516s; TotalTimePerSample = 0.50064ms; SamplesPerSecond = 1997
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19243442; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12552s; TotalTimePerSample = 0.50206ms; SamplesPerSecond = 1991
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20908161; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12500s; TotalTimePerSample = 0.49999ms; SamplesPerSecond = 2000
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18472067; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12575s; TotalTimePerSample = 0.50299ms; SamplesPerSecond = 1988
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18185536; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12514s; TotalTimePerSample = 0.50057ms; SamplesPerSecond = 1997
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14074204; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12507s; TotalTimePerSample = 0.50029ms; SamplesPerSecond = 1998
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14871620; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12522s; TotalTimePerSample = 0.50087ms; SamplesPerSecond = 1996
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20299705; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12556s; TotalTimePerSample = 0.50223ms; SamplesPerSecond = 1991
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12852037; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12522s; TotalTimePerSample = 0.50087ms; SamplesPerSecond = 1996
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18660440; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12514s; TotalTimePerSample = 0.50057ms; SamplesPerSecond = 1997
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19575997; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12505s; TotalTimePerSample = 0.50022ms; SamplesPerSecond = 1999
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16667676; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12596s; TotalTimePerSample = 0.50386ms; SamplesPerSecond = 1984
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12526168; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12511s; TotalTimePerSample = 0.50044ms; SamplesPerSecond = 1998
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17392133; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12561s; TotalTimePerSample = 0.50246ms; SamplesPerSecond = 1990
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12281615; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12485s; TotalTimePerSample = 0.49940ms; SamplesPerSecond = 2002
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14759390; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12524s; TotalTimePerSample = 0.50097ms; SamplesPerSecond = 1996
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19801300; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12533s; TotalTimePerSample = 0.50132ms; SamplesPerSecond = 1994
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12593395; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12505s; TotalTimePerSample = 0.50019ms; SamplesPerSecond = 1999
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13756617; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12535s; TotalTimePerSample = 0.50138ms; SamplesPerSecond = 1994
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12529s; TotalTimePerSample = 0.50118ms; SamplesPerSecond = 1995
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16654368; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12547s; TotalTimePerSample = 0.50188ms; SamplesPerSecond = 1992
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20658950; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12535s; TotalTimePerSample = 0.50142ms; SamplesPerSecond = 1994
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14583322; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12561s; TotalTimePerSample = 0.50244ms; SamplesPerSecond = 1990
+MPI Rank 2: Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 0.15948618; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.016697
+MPI Rank 2: Starting Epoch 4: learning rate per sample = 0.008000  effective momentum = 0.900000 
 MPI Rank 2: starting epoch 3 at record count 30000, and file position 0
 MPI Rank 2: already there from last epoch
 MPI Rank 2: 
 MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 4, NumGradientBits = 32).
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[   1-  10 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12371233; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12757s; TotalTimePerSample = 0.51028ms; SamplesPerSecond = 1959
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[  11-  20 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18070515; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12299s; TotalTimePerSample = 0.49195ms; SamplesPerSecond = 2032
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[  21-  30 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14239721; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12165s; TotalTimePerSample = 0.48662ms; SamplesPerSecond = 2054
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[  31-  40 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15630139; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12202s; TotalTimePerSample = 0.48810ms; SamplesPerSecond = 2048
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[  41-  50 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16935523; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12091s; TotalTimePerSample = 0.48366ms; SamplesPerSecond = 2067
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[  51-  60 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18198816; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12087s; TotalTimePerSample = 0.48349ms; SamplesPerSecond = 2068
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[  61-  70 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14475952; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12254s; TotalTimePerSample = 0.49015ms; SamplesPerSecond = 2040
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[  71-  80 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18021594; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12208s; TotalTimePerSample = 0.48833ms; SamplesPerSecond = 2047
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[  81-  90 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15849304; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12485s; TotalTimePerSample = 0.49942ms; SamplesPerSecond = 2002
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[  91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14474402; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12218s; TotalTimePerSample = 0.48871ms; SamplesPerSecond = 2046
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13362928; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12226s; TotalTimePerSample = 0.48905ms; SamplesPerSecond = 2044
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13708325; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12181s; TotalTimePerSample = 0.48724ms; SamplesPerSecond = 2052
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.11569763; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12244s; TotalTimePerSample = 0.48975ms; SamplesPerSecond = 2041
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16892321; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12148s; TotalTimePerSample = 0.48594ms; SamplesPerSecond = 2057
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12752125; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12131s; TotalTimePerSample = 0.48522ms; SamplesPerSecond = 2060
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17100880; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12211s; TotalTimePerSample = 0.48844ms; SamplesPerSecond = 2047
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17660449; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12219s; TotalTimePerSample = 0.48876ms; SamplesPerSecond = 2045
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14105836; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12168s; TotalTimePerSample = 0.48672ms; SamplesPerSecond = 2054
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19333544; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12124s; TotalTimePerSample = 0.48494ms; SamplesPerSecond = 2062
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20859498; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12071s; TotalTimePerSample = 0.48283ms; SamplesPerSecond = 2071
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18499707; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12379s; TotalTimePerSample = 0.49516ms; SamplesPerSecond = 2019
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18152441; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12164s; TotalTimePerSample = 0.48655ms; SamplesPerSecond = 2055
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14037134; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12260s; TotalTimePerSample = 0.49039ms; SamplesPerSecond = 2039
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14866894; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12391s; TotalTimePerSample = 0.49563ms; SamplesPerSecond = 2017
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20347705; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12230s; TotalTimePerSample = 0.48922ms; SamplesPerSecond = 2044
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12815039; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12247s; TotalTimePerSample = 0.48989ms; SamplesPerSecond = 2041
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18672803; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12289s; TotalTimePerSample = 0.49155ms; SamplesPerSecond = 2034
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19552930; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12108s; TotalTimePerSample = 0.48431ms; SamplesPerSecond = 2064
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16452637; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12196s; TotalTimePerSample = 0.48783ms; SamplesPerSecond = 2049
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12461865; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12027s; TotalTimePerSample = 0.48108ms; SamplesPerSecond = 2078
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17285107; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12184s; TotalTimePerSample = 0.48738ms; SamplesPerSecond = 2051
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12253613; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12353s; TotalTimePerSample = 0.49410ms; SamplesPerSecond = 2023
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14723291; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12238s; TotalTimePerSample = 0.48952ms; SamplesPerSecond = 2042
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19789551; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12335s; TotalTimePerSample = 0.49339ms; SamplesPerSecond = 2026
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12575878; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12336s; TotalTimePerSample = 0.49344ms; SamplesPerSecond = 2026
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13745947; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12227s; TotalTimePerSample = 0.48906ms; SamplesPerSecond = 2044
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12839746; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12153s; TotalTimePerSample = 0.48612ms; SamplesPerSecond = 2057
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16647315; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12210s; TotalTimePerSample = 0.48838ms; SamplesPerSecond = 2047
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20679444; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12144s; TotalTimePerSample = 0.48576ms; SamplesPerSecond = 2058
-MPI Rank 2:  Epoch[ 4 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14585204; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12219s; TotalTimePerSample = 0.48878ms; SamplesPerSecond = 2045
-MPI Rank 2: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15914927; EvalErrPerSample = 0.076700002; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.9272
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12371233; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12588s; TotalTimePerSample = 0.50353ms; SamplesPerSecond = 1985
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18070514; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12537s; TotalTimePerSample = 0.50147ms; SamplesPerSecond = 1994
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14239731; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12565s; TotalTimePerSample = 0.50262ms; SamplesPerSecond = 1989
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15630155; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12544s; TotalTimePerSample = 0.50175ms; SamplesPerSecond = 1993
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16935525; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12540s; TotalTimePerSample = 0.50158ms; SamplesPerSecond = 1993
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18198833; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12549s; TotalTimePerSample = 0.50196ms; SamplesPerSecond = 1992
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14475946; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12533s; TotalTimePerSample = 0.50132ms; SamplesPerSecond = 1994
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18021602; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12467s; TotalTimePerSample = 0.49869ms; SamplesPerSecond = 2005
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15849308; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12526s; TotalTimePerSample = 0.50102ms; SamplesPerSecond = 1995
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14474426; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12565s; TotalTimePerSample = 0.50260ms; SamplesPerSecond = 1989
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13362926; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12457s; TotalTimePerSample = 0.49827ms; SamplesPerSecond = 2006
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13708300; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12521s; TotalTimePerSample = 0.50084ms; SamplesPerSecond = 1996
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.11569776; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12513s; TotalTimePerSample = 0.50054ms; SamplesPerSecond = 1997
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16892330; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12556s; TotalTimePerSample = 0.50224ms; SamplesPerSecond = 1991
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12752163; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12519s; TotalTimePerSample = 0.50077ms; SamplesPerSecond = 1996
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17100866; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12561s; TotalTimePerSample = 0.50244ms; SamplesPerSecond = 1990
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17660425; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12498s; TotalTimePerSample = 0.49991ms; SamplesPerSecond = 2000
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14105804; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12544s; TotalTimePerSample = 0.50175ms; SamplesPerSecond = 1993
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19333553; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12517s; TotalTimePerSample = 0.50068ms; SamplesPerSecond = 1997
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20859525; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12541s; TotalTimePerSample = 0.50164ms; SamplesPerSecond = 1993
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18499677; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12594s; TotalTimePerSample = 0.50377ms; SamplesPerSecond = 1985
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18152438; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12537s; TotalTimePerSample = 0.50148ms; SamplesPerSecond = 1994
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14037158; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12552s; TotalTimePerSample = 0.50210ms; SamplesPerSecond = 1991
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14866862; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12546s; TotalTimePerSample = 0.50185ms; SamplesPerSecond = 1992
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20347748; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12561s; TotalTimePerSample = 0.50245ms; SamplesPerSecond = 1990
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12815013; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12522s; TotalTimePerSample = 0.50086ms; SamplesPerSecond = 1996
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18672810; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12548s; TotalTimePerSample = 0.50193ms; SamplesPerSecond = 1992
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19552989; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12513s; TotalTimePerSample = 0.50053ms; SamplesPerSecond = 1997
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16452642; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12554s; TotalTimePerSample = 0.50216ms; SamplesPerSecond = 1991
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12461825; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12499s; TotalTimePerSample = 0.49995ms; SamplesPerSecond = 2000
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17285251; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12571s; TotalTimePerSample = 0.50285ms; SamplesPerSecond = 1988
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12253620; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12563s; TotalTimePerSample = 0.50251ms; SamplesPerSecond = 1990
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14723333; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12541s; TotalTimePerSample = 0.50165ms; SamplesPerSecond = 1993
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19789537; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12616s; TotalTimePerSample = 0.50464ms; SamplesPerSecond = 1981
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12575877; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12608s; TotalTimePerSample = 0.50434ms; SamplesPerSecond = 1982
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13745928; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12557s; TotalTimePerSample = 0.50227ms; SamplesPerSecond = 1990
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12839652; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12497s; TotalTimePerSample = 0.49987ms; SamplesPerSecond = 2000
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16647280; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12551s; TotalTimePerSample = 0.50202ms; SamplesPerSecond = 1991
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20679434; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12523s; TotalTimePerSample = 0.50091ms; SamplesPerSecond = 1996
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12566s; TotalTimePerSample = 0.50262ms; SamplesPerSecond = 1989
+MPI Rank 2: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.018583
+MPI Rank 2: CNTKCommandTrainEnd: SimpleMultiGPU
 MPI Rank 2: COMPLETED
 MPI Rank 2: ~MPIWrapper
-MPI Rank 3: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank3
-MPI Rank 3: -------------------------------------------------------------------
-MPI Rank 3: Build info: 
-MPI Rank 3: 
-MPI Rank 3: 		Built time: Aug 25 2015 17:44:46
-MPI Rank 3: 		Last modified date: Mon Aug 24 16:38:42 2015
-MPI Rank 3: 		Built by amitaga on Amitaga-Win-DT3           
-MPI Rank 3: 		Build Path: E:\NetScale\CNTK\git_repos\public_master\MachineLearning\CNTK\
-MPI Rank 3: 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
-MPI Rank 3: -------------------------------------------------------------------
-MPI Rank 3: running on Amitaga-Win-DT3 at 2015/08/26 01:48:44
-MPI Rank 3: command line options: 
-MPI Rank 3: configFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\SimpleMultiGPU.config RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] 
+MPI Rank 3: running on localhost at 2015/10/24 12:44:55
+MPI Rank 3: command line: 
+MPI Rank 3: /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../../SimpleMultiGPU.config RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../.. DeviceId=0 precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr 
 MPI Rank 3: 
 MPI Rank 3: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
 MPI Rank 3: deviceId=$DeviceId$
@@ -1482,12 +1851,11 @@ MPI Rank 3:         minibatchSize=25
 MPI Rank 3:         learningRatesPerMB=0.5:0.2*20:0.1
 MPI Rank 3:         momentumPerMB=0.9
 MPI Rank 3:         dropoutRate=0.0
-MPI Rank 3:         maxEpochs=10
+MPI Rank 3:         maxEpochs=4
 MPI Rank 3:         ParallelTrain=[
 MPI Rank 3:             parallelizationMethod=DataParallelSGD
 MPI Rank 3:             DataParallelSGD=[
 MPI Rank 3:               gradientBits=1
-MPI Rank 3:               parallelizationStartEpoch=1
 MPI Rank 3:             ]
 MPI Rank 3:         ]
 MPI Rank 3:     ]
@@ -1509,12 +1877,13 @@ MPI Rank 3:         labelMappingFile=$DataDir$/SimpleMapping.txt
 MPI Rank 3:       ]
 MPI Rank 3:     ]
 MPI Rank 3: ]
-MPI Rank 3: RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
-MPI Rank 3: DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data
+MPI Rank 3: RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 3: DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data
+MPI Rank 3: ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../..
 MPI Rank 3: DeviceId=0
-MPI Rank 3: stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
 MPI Rank 3: precision=float
 MPI Rank 3: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
+MPI Rank 3: stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr
 MPI Rank 3: 
 MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
 MPI Rank 3: 
@@ -1525,7 +1894,7 @@ MPI Rank 3: precision=float
 MPI Rank 3: parallelTrain=true
 MPI Rank 3: SimpleMultiGPU=[
 MPI Rank 3:     action=train
-MPI Rank 3:     modelPath=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 3:     modelPath=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
 MPI Rank 3:     deviceId=0
 MPI Rank 3:     traceLevel=1
 MPI Rank 3:     SimpleNetworkBuilder=[
@@ -1544,18 +1913,17 @@ MPI Rank 3:         minibatchSize=25
 MPI Rank 3:         learningRatesPerMB=0.5:0.2*20:0.1
 MPI Rank 3:         momentumPerMB=0.9
 MPI Rank 3:         dropoutRate=0.0
-MPI Rank 3:         maxEpochs=10
+MPI Rank 3:         maxEpochs=4
 MPI Rank 3:         ParallelTrain=[
 MPI Rank 3:             parallelizationMethod=DataParallelSGD
 MPI Rank 3:             DataParallelSGD=[
 MPI Rank 3:               gradientBits=1
-MPI Rank 3:               parallelizationStartEpoch=1
 MPI Rank 3:             ]
 MPI Rank 3:         ]
 MPI Rank 3:     ]
 MPI Rank 3:     reader=[
 MPI Rank 3:       readerType=UCIFastReader
-MPI Rank 3:       file=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 3:       file=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt
 MPI Rank 3:       miniBatchMode=Partial
 MPI Rank 3:       randomize=None
 MPI Rank 3:       verbosity=1   
@@ -1567,29 +1935,31 @@ MPI Rank 3:       labels=[
 MPI Rank 3: start=2      
 MPI Rank 3: dim=1        
 MPI Rank 3: labelDim=2   
-MPI Rank 3:         labelMappingFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleMapping.txt
+MPI Rank 3:         labelMappingFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleMapping.txt
 MPI Rank 3:       ]
 MPI Rank 3:     ]
 MPI Rank 3: ]
-MPI Rank 3: RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
-MPI Rank 3: DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data
+MPI Rank 3: RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 3: DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data
+MPI Rank 3: ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../..
 MPI Rank 3: DeviceId=0
-MPI Rank 3: stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
 MPI Rank 3: precision=float
 MPI Rank 3: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
+MPI Rank 3: stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr
 MPI Rank 3: 
 MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 MPI Rank 3: 
 MPI Rank 3: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 MPI Rank 3: configparameters: SimpleMultiGPU.config:command=SimpleMultiGPU
-MPI Rank 3: configparameters: SimpleMultiGPU.config:DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data
+MPI Rank 3: configparameters: SimpleMultiGPU.config:ConfigDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/NoQuantization/SinglePrecision/../..
+MPI Rank 3: configparameters: SimpleMultiGPU.config:DataDir=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data
 MPI Rank 3: configparameters: SimpleMultiGPU.config:deviceId=0
 MPI Rank 3: configparameters: SimpleMultiGPU.config:parallelTrain=true
 MPI Rank 3: configparameters: SimpleMultiGPU.config:precision=float
-MPI Rank 3: configparameters: SimpleMultiGPU.config:RunDir=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 3: configparameters: SimpleMultiGPU.config:RunDir=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu
 MPI Rank 3: configparameters: SimpleMultiGPU.config:SimpleMultiGPU=[
 MPI Rank 3:     action=train
-MPI Rank 3:     modelPath=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 3:     modelPath=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
 MPI Rank 3:     deviceId=0
 MPI Rank 3:     traceLevel=1
 MPI Rank 3:     SimpleNetworkBuilder=[
@@ -1608,18 +1978,17 @@ MPI Rank 3:         minibatchSize=25
 MPI Rank 3:         learningRatesPerMB=0.5:0.2*20:0.1
 MPI Rank 3:         momentumPerMB=0.9
 MPI Rank 3:         dropoutRate=0.0
-MPI Rank 3:         maxEpochs=10
+MPI Rank 3:         maxEpochs=4
 MPI Rank 3:         ParallelTrain=[
 MPI Rank 3:             parallelizationMethod=DataParallelSGD
 MPI Rank 3:             DataParallelSGD=[
 MPI Rank 3:               gradientBits=1
-MPI Rank 3:               parallelizationStartEpoch=1
 MPI Rank 3:             ]
 MPI Rank 3:         ]
 MPI Rank 3:     ]
 MPI Rank 3:     reader=[
 MPI Rank 3:       readerType=UCIFastReader
-MPI Rank 3:       file=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 3:       file=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt
 MPI Rank 3:       miniBatchMode=Partial
 MPI Rank 3:       randomize=None
 MPI Rank 3:       verbosity=1   
@@ -1631,45 +2000,100 @@ MPI Rank 3:       labels=[
 MPI Rank 3: start=2      
 MPI Rank 3: dim=1        
 MPI Rank 3: labelDim=2   
-MPI Rank 3:         labelMappingFile=E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleMapping.txt
+MPI Rank 3:         labelMappingFile=/home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleMapping.txt
 MPI Rank 3:       ]
 MPI Rank 3:     ]
 MPI Rank 3: ] [SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
 MPI Rank 3: 
-MPI Rank 3: configparameters: SimpleMultiGPU.config:stderr=C:\cygwin64\tmp\cntk-test-20150825174842.581682\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
+MPI Rank 3: configparameters: SimpleMultiGPU.config:stderr=/tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/stderr
 MPI Rank 3: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 MPI Rank 3: command: SimpleMultiGPU 
 MPI Rank 3: precision = float
+MPI Rank 3: CNTKModelPath: /tmp/cntk-test-20151024124453.495855/ParallelTraining/NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 3: CNTKCommandTrainInfo: SimpleMultiGPU : 4
+MPI Rank 3: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 4
+MPI Rank 3: CNTKCommandTrainBegin: SimpleMultiGPU
 MPI Rank 3: SimpleNetworkBuilder Using GPU 0
-MPI Rank 3: reading uci file E:\NetScale\CNTK\git_repos\public_master\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 3: reading uci file /home/mluser/src/cplx_master/Tests/ParallelTraining/Data/SimpleDataTrain.txt
+MPI Rank 3: SetUniformRandomValue (GPU): creating curand object with seed 1
 MPI Rank 3: GetTrainCriterionNodes  ...
 MPI Rank 3: GetEvalCriterionNodes  ...
 MPI Rank 3: 
 MPI Rank 3: 
-MPI Rank 3: Validating node CrossEntropyWithSoftmax 
+MPI Rank 3: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1.
 MPI Rank 3: 
-MPI Rank 3: Validating --> labels = InputValue
-MPI Rank 3: Validating --> W2 = LearnableParameter
-MPI Rank 3: Validating --> W1 = LearnableParameter
-MPI Rank 3: Validating --> W0 = LearnableParameter
-MPI Rank 3: Validating --> features = InputValue
-MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, 3])
-MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, 3])
-MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1])
-MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, 3])
-MPI Rank 3: Validating --> B0 = LearnableParameter
-MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, 3], B0[50, 1])
-MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, 3])
-MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, 3])
-MPI Rank 3: Validating --> B1 = LearnableParameter
-MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, 3], B1[50, 1])
-MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, 3])
-MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, 3])
-MPI Rank 3: Validating --> B2 = LearnableParameter
-MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, 3], B2[2, 1])
-MPI Rank 3: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, 3], HLast[2, 3])
+MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 3: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 3: 
+MPI Rank 3: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2.
+MPI Rank 3: 
+MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 3: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 3: 
+MPI Rank 3: Validating for node CrossEntropyWithSoftmax, final verification.
+MPI Rank 3: 
+MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 3: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 3: 
+MPI Rank 3: 9 out of 20 nodes do not share the minibatch layout with the input data.
+MPI Rank 3: 
+MPI Rank 3: 
+MPI Rank 3: Precomputing --> 3 PreCompute nodes found.
 MPI Rank 3: 
-MPI Rank 3: Found 3 PreCompute nodes
 MPI Rank 3: 	NodeName: InvStdOfFeatures
 MPI Rank 3: 	NodeName: MeanOfFeatures
 MPI Rank 3: 	NodeName: Prior
@@ -1680,234 +2104,314 @@ MPI Rank 3: starting epoch 0 at record count 0, and file position 0
 MPI Rank 3: already there from last epoch
 MPI Rank 3: 
 MPI Rank 3: 
-MPI Rank 3: Validating node InvStdOfFeatures 
+MPI Rank 3: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1.
 MPI Rank 3: 
-MPI Rank 3: Validating --> features = InputValue
-MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, 25])
+MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: 
+MPI Rank 3: Validating for node InvStdOfFeatures, final verification.
+MPI Rank 3: 
+MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: 
+MPI Rank 3: 1 out of 2 nodes do not share the minibatch layout with the input data.
 MPI Rank 3: 
 MPI Rank 3: 
 MPI Rank 3: 
-MPI Rank 3: Validating node MeanOfFeatures 
+MPI Rank 3: Validating for node MeanOfFeatures. 2 nodes to process in pass 1.
 MPI Rank 3: 
-MPI Rank 3: Validating --> features = InputValue
-MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, 25])
+MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: 
+MPI Rank 3: Validating for node MeanOfFeatures, final verification.
+MPI Rank 3: 
+MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: 
+MPI Rank 3: 1 out of 2 nodes do not share the minibatch layout with the input data.
 MPI Rank 3: 
 MPI Rank 3: 
 MPI Rank 3: 
-MPI Rank 3: Validating node Prior 
+MPI Rank 3: Validating for node Prior. 2 nodes to process in pass 1.
 MPI Rank 3: 
-MPI Rank 3: Validating --> labels = InputValue
-MPI Rank 3: Validating --> Prior = Mean(labels[2, 25])
+MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: 
+MPI Rank 3: Validating for node Prior. 1 nodes to process in pass 2.
+MPI Rank 3: 
+MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: 
+MPI Rank 3: Validating for node Prior, final verification.
+MPI Rank 3: 
+MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: 
+MPI Rank 3: 1 out of 2 nodes do not share the minibatch layout with the input data.
+MPI Rank 3: 
+MPI Rank 3: EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once.
+MPI Rank 3: 
+MPI Rank 3: Precomputing --> Completed.
 MPI Rank 3: 
 MPI Rank 3: Set Max Temp Mem Size For Convolution Nodes to 0 samples.
-MPI Rank 3: Starting Epoch 1: learning rate per sample = 0.020000  momentum = 0.900001 
+MPI Rank 3: Starting Epoch 1: learning rate per sample = 0.020000  effective momentum = 0.900000 
 MPI Rank 3: starting epoch 0 at record count 0, and file position 0
 MPI Rank 3: already there from last epoch
 MPI Rank 3: 
+MPI Rank 3: 
+MPI Rank 3: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1.
+MPI Rank 3: 
+MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 3: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 3: 
+MPI Rank 3: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2.
+MPI Rank 3: 
+MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 3: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 3: 
+MPI Rank 3: Validating for node EvalErrorPrediction, final verification.
+MPI Rank 3: 
+MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 3: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 3: 
+MPI Rank 3: 9 out of 20 nodes do not share the minibatch layout with the input data.
+MPI Rank 3: 
+MPI Rank 3: 
 MPI Rank 3: Starting minibatch loop, DataParallelSGD training (MyRank = 3, NumNodes = 4, NumGradientBits = 32).
-MPI Rank 3: 
-MPI Rank 3: 
-MPI Rank 3: Validating node EvalErrorPrediction 
-MPI Rank 3: 
-MPI Rank 3: Validating --> labels = InputValue
-MPI Rank 3: Validating --> W2 = LearnableParameter
-MPI Rank 3: Validating --> W1 = LearnableParameter
-MPI Rank 3: Validating --> W0 = LearnableParameter
-MPI Rank 3: Validating --> features = InputValue
-MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, 7])
-MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, 7])
-MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, 7], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1])
-MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, 7])
-MPI Rank 3: Validating --> B0 = LearnableParameter
-MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, 7], B0[50, 1])
-MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, 7])
-MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, 7])
-MPI Rank 3: Validating --> B1 = LearnableParameter
-MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, 7], B1[50, 1])
-MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, 7])
-MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, 7])
-MPI Rank 3: Validating --> B2 = LearnableParameter
-MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, 7], B2[2, 1])
-MPI Rank 3: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, 7], HLast[2, 7])
-MPI Rank 3: 
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[   1-  10 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70007980; EvalErr[0]PerSample = 0.52399999; TotalTime = 0.20117s; TotalTimePerSample = 0.80470ms; SamplesPerSecond = 1242
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[  11-  20 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71514523; EvalErr[0]PerSample = 0.51999998; TotalTime = 0.15541s; TotalTimePerSample = 0.62162ms; SamplesPerSecond = 1608
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[  21-  30 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.72945595; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.14893s; TotalTimePerSample = 0.59571ms; SamplesPerSecond = 1678
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[  31-  40 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70079005; EvalErr[0]PerSample = 0.52399999; TotalTime = 0.14465s; TotalTimePerSample = 0.57860ms; SamplesPerSecond = 1728
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[  41-  50 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70605618; EvalErr[0]PerSample = 0.54000002; TotalTime = 0.14226s; TotalTimePerSample = 0.56906ms; SamplesPerSecond = 1757
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[  51-  60 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71572435; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.13673s; TotalTimePerSample = 0.54692ms; SamplesPerSecond = 1828
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[  61-  70 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.72149903; EvalErr[0]PerSample = 0.47999999; TotalTime = 0.13632s; TotalTimePerSample = 0.54528ms; SamplesPerSecond = 1833
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[  71-  80 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.79845655; EvalErr[0]PerSample = 0.47600001; TotalTime = 0.13450s; TotalTimePerSample = 0.53800ms; SamplesPerSecond = 1858
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[  81-  90 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69665188; EvalErr[0]PerSample = 0.46799999; TotalTime = 0.13040s; TotalTimePerSample = 0.52161ms; SamplesPerSecond = 1917
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[  91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70723337; EvalErr[0]PerSample = 0.49200001; TotalTime = 0.12784s; TotalTimePerSample = 0.51137ms; SamplesPerSecond = 1955
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71420360; EvalErr[0]PerSample = 0.55199999; TotalTime = 0.12616s; TotalTimePerSample = 0.50466ms; SamplesPerSecond = 1981
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69535255; EvalErr[0]PerSample = 0.43599999; TotalTime = 0.12550s; TotalTimePerSample = 0.50198ms; SamplesPerSecond = 1992
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70078611; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12245s; TotalTimePerSample = 0.48982ms; SamplesPerSecond = 2041
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71857810; EvalErr[0]PerSample = 0.54799998; TotalTime = 0.12284s; TotalTimePerSample = 0.49136ms; SamplesPerSecond = 2035
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.72088283; EvalErr[0]PerSample = 0.48800001; TotalTime = 0.12223s; TotalTimePerSample = 0.48893ms; SamplesPerSecond = 2045
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71798825; EvalErr[0]PerSample = 0.55199999; TotalTime = 0.12113s; TotalTimePerSample = 0.48453ms; SamplesPerSecond = 2063
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.74162209; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12325s; TotalTimePerSample = 0.49300ms; SamplesPerSecond = 2028
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71835059; EvalErr[0]PerSample = 0.51599997; TotalTime = 0.12335s; TotalTimePerSample = 0.49339ms; SamplesPerSecond = 2026
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71529394; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12333s; TotalTimePerSample = 0.49334ms; SamplesPerSecond = 2027
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71727639; EvalErr[0]PerSample = 0.53200001; TotalTime = 0.12471s; TotalTimePerSample = 0.49884ms; SamplesPerSecond = 2004
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71745312; EvalErr[0]PerSample = 0.50400001; TotalTime = 0.12359s; TotalTimePerSample = 0.49437ms; SamplesPerSecond = 2022
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.72088087; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12196s; TotalTimePerSample = 0.48784ms; SamplesPerSecond = 2049
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.72006541; EvalErr[0]PerSample = 0.50800002; TotalTime = 0.12266s; TotalTimePerSample = 0.49064ms; SamplesPerSecond = 2038
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.71275192; EvalErr[0]PerSample = 0.51200002; TotalTime = 0.12160s; TotalTimePerSample = 0.48640ms; SamplesPerSecond = 2055
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69645119; EvalErr[0]PerSample = 0.50400001; TotalTime = 0.12120s; TotalTimePerSample = 0.48479ms; SamplesPerSecond = 2062
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70129883; EvalErr[0]PerSample = 0.51200002; TotalTime = 0.12089s; TotalTimePerSample = 0.48356ms; SamplesPerSecond = 2067
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70768166; EvalErr[0]PerSample = 0.54400003; TotalTime = 0.12162s; TotalTimePerSample = 0.48648ms; SamplesPerSecond = 2055
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69744140; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.12267s; TotalTimePerSample = 0.49068ms; SamplesPerSecond = 2037
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69266015; EvalErr[0]PerSample = 0.44800001; TotalTime = 0.12178s; TotalTimePerSample = 0.48714ms; SamplesPerSecond = 2052
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69347072; EvalErr[0]PerSample = 0.49599999; TotalTime = 0.12231s; TotalTimePerSample = 0.48926ms; SamplesPerSecond = 2043
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69257420; EvalErr[0]PerSample = 0.54000002; TotalTime = 0.12288s; TotalTimePerSample = 0.49150ms; SamplesPerSecond = 2034
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.68625975; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12415s; TotalTimePerSample = 0.49658ms; SamplesPerSecond = 2013
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69064063; EvalErr[0]PerSample = 0.46799999; TotalTime = 0.12340s; TotalTimePerSample = 0.49359ms; SamplesPerSecond = 2025
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.70192385; EvalErr[0]PerSample = 0.46000001; TotalTime = 0.12176s; TotalTimePerSample = 0.48703ms; SamplesPerSecond = 2053
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.69058985; EvalErr[0]PerSample = 0.51999998; TotalTime = 0.12237s; TotalTimePerSample = 0.48949ms; SamplesPerSecond = 2042
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.67041212; EvalErr[0]PerSample = 0.39199999; TotalTime = 0.12186s; TotalTimePerSample = 0.48742ms; SamplesPerSecond = 2051
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.65914255; EvalErr[0]PerSample = 0.35600001; TotalTime = 0.12261s; TotalTimePerSample = 0.49044ms; SamplesPerSecond = 2038
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.63919920; EvalErr[0]PerSample = 0.36399999; TotalTime = 0.12265s; TotalTimePerSample = 0.49060ms; SamplesPerSecond = 2038
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.61294138; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.12142s; TotalTimePerSample = 0.48568ms; SamplesPerSecond = 2058
-MPI Rank 3:  Epoch[ 1 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.55255663; EvalErr[0]PerSample = 0.18799999; TotalTime = 0.12034s; TotalTimePerSample = 0.48134ms; SamplesPerSecond = 2077
-MPI Rank 3: Finished Epoch[ 1 of 10]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.47350001; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.253729
-MPI Rank 3: Starting Epoch 2: learning rate per sample = 0.008000  momentum = 0.900001 
+MPI Rank 3: DecimateMinibatchSequences: WARNING: Number of parallel utterances 25 not a multiple of number of GPUs 4, GPU usage will be suboptimal.
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[   1-  10 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70007977; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.12756s; TotalTimePerSample = 0.51025ms; SamplesPerSecond = 1959
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[  11-  20 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71514542; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12451s; TotalTimePerSample = 0.49804ms; SamplesPerSecond = 2007
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[  21-  30 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72945595; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12484s; TotalTimePerSample = 0.49937ms; SamplesPerSecond = 2002
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[  31-  40 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70079058; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.12472s; TotalTimePerSample = 0.49888ms; SamplesPerSecond = 2004
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[  41-  50 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70605616; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12464s; TotalTimePerSample = 0.49857ms; SamplesPerSecond = 2005
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[  51-  60 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71572398; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12454s; TotalTimePerSample = 0.49814ms; SamplesPerSecond = 2007
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[  61-  70 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72149851; EvalErr[0]PerSample = 0.48000000; TotalTime = 0.12437s; TotalTimePerSample = 0.49750ms; SamplesPerSecond = 2010
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[  71-  80 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.79845605; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.12464s; TotalTimePerSample = 0.49854ms; SamplesPerSecond = 2005
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[  81-  90 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69665186; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12442s; TotalTimePerSample = 0.49766ms; SamplesPerSecond = 2009
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[  91- 100 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70723326; EvalErr[0]PerSample = 0.49200000; TotalTime = 0.12491s; TotalTimePerSample = 0.49965ms; SamplesPerSecond = 2001
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 101- 110 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71420344; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.12493s; TotalTimePerSample = 0.49973ms; SamplesPerSecond = 2001
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 111- 120 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69535258; EvalErr[0]PerSample = 0.43600000; TotalTime = 0.12517s; TotalTimePerSample = 0.50070ms; SamplesPerSecond = 1997
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 121- 130 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70078531; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12504s; TotalTimePerSample = 0.50018ms; SamplesPerSecond = 1999
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 131- 140 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71857915; EvalErr[0]PerSample = 0.54800000; TotalTime = 0.12456s; TotalTimePerSample = 0.49823ms; SamplesPerSecond = 2007
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 141- 150 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72088358; EvalErr[0]PerSample = 0.48800000; TotalTime = 0.12477s; TotalTimePerSample = 0.49910ms; SamplesPerSecond = 2003
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 151- 160 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71798839; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.12521s; TotalTimePerSample = 0.50086ms; SamplesPerSecond = 1996
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 161- 170 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.74162164; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12412s; TotalTimePerSample = 0.49647ms; SamplesPerSecond = 2014
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 171- 180 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71835127; EvalErr[0]PerSample = 0.51600000; TotalTime = 0.12483s; TotalTimePerSample = 0.49930ms; SamplesPerSecond = 2002
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 181- 190 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71529462; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12466s; TotalTimePerSample = 0.49865ms; SamplesPerSecond = 2005
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 191- 200 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71727656; EvalErr[0]PerSample = 0.53200000; TotalTime = 0.12438s; TotalTimePerSample = 0.49752ms; SamplesPerSecond = 2009
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 201- 210 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71745517; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.12477s; TotalTimePerSample = 0.49908ms; SamplesPerSecond = 2003
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 211- 220 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72088397; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.12416s; TotalTimePerSample = 0.49663ms; SamplesPerSecond = 2013
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 221- 230 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72006808; EvalErr[0]PerSample = 0.50800000; TotalTime = 0.12445s; TotalTimePerSample = 0.49778ms; SamplesPerSecond = 2008
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 231- 240 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71275468; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12513s; TotalTimePerSample = 0.50054ms; SamplesPerSecond = 1997
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 241- 250 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69644781; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.12463s; TotalTimePerSample = 0.49852ms; SamplesPerSecond = 2005
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 251- 260 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70129697; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12494s; TotalTimePerSample = 0.49976ms; SamplesPerSecond = 2000
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 261- 270 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70768095; EvalErr[0]PerSample = 0.54400000; TotalTime = 0.12514s; TotalTimePerSample = 0.50055ms; SamplesPerSecond = 1997
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 271- 280 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69744379; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.12533s; TotalTimePerSample = 0.50132ms; SamplesPerSecond = 1994
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 281- 290 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69266186; EvalErr[0]PerSample = 0.44800000; TotalTime = 0.12545s; TotalTimePerSample = 0.50178ms; SamplesPerSecond = 1992
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 291- 300 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69347266; EvalErr[0]PerSample = 0.49600000; TotalTime = 0.12500s; TotalTimePerSample = 0.50000ms; SamplesPerSecond = 1999
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 301- 310 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69257410; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12540s; TotalTimePerSample = 0.50159ms; SamplesPerSecond = 1993
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 311- 320 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.68625741; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12518s; TotalTimePerSample = 0.50071ms; SamplesPerSecond = 1997
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 321- 330 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69064011; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12548s; TotalTimePerSample = 0.50190ms; SamplesPerSecond = 1992
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 331- 340 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70192153; EvalErr[0]PerSample = 0.46000000; TotalTime = 0.12511s; TotalTimePerSample = 0.50045ms; SamplesPerSecond = 1998
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 341- 350 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69058912; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12552s; TotalTimePerSample = 0.50208ms; SamplesPerSecond = 1991
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 351- 360 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.67041489; EvalErr[0]PerSample = 0.39200000; TotalTime = 0.12577s; TotalTimePerSample = 0.50310ms; SamplesPerSecond = 1987
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 361- 370 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.65913971; EvalErr[0]PerSample = 0.35600000; TotalTime = 0.12536s; TotalTimePerSample = 0.50142ms; SamplesPerSecond = 1994
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 371- 380 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.63919874; EvalErr[0]PerSample = 0.36400000; TotalTime = 0.12523s; TotalTimePerSample = 0.50094ms; SamplesPerSecond = 1996
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 381- 390 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.61293878; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.12515s; TotalTimePerSample = 0.50062ms; SamplesPerSecond = 1997
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 391- 400 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.55255340; EvalErr[0]PerSample = 0.18800000; TotalTime = 0.12492s; TotalTimePerSample = 0.49970ms; SamplesPerSecond = 2001
+MPI Rank 3: Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.4735; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.008585
+MPI Rank 3: Starting Epoch 2: learning rate per sample = 0.008000  effective momentum = 0.900000 
 MPI Rank 3: starting epoch 1 at record count 10000, and file position 0
 MPI Rank 3: already there from last epoch
 MPI Rank 3: 
 MPI Rank 3: Starting minibatch loop, DataParallelSGD training (MyRank = 3, NumNodes = 4, NumGradientBits = 32).
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[   1-  10 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.50775200; EvalErr[0]PerSample = 0.23999999; TotalTime = 0.12816s; TotalTimePerSample = 0.51265ms; SamplesPerSecond = 1950
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[  11-  20 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.43389454; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12188s; TotalTimePerSample = 0.48751ms; SamplesPerSecond = 2051
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[  21-  30 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.36675408; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12077s; TotalTimePerSample = 0.48307ms; SamplesPerSecond = 2070
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[  31-  40 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.33769274; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12254s; TotalTimePerSample = 0.49018ms; SamplesPerSecond = 2040
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[  41-  50 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.30321363; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12133s; TotalTimePerSample = 0.48531ms; SamplesPerSecond = 2060
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[  51-  60 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.29576379; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12053s; TotalTimePerSample = 0.48212ms; SamplesPerSecond = 2074
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[  61-  70 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.24924731; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12247s; TotalTimePerSample = 0.48987ms; SamplesPerSecond = 2041
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[  71-  80 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.24632569; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12328s; TotalTimePerSample = 0.49313ms; SamplesPerSecond = 2027
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[  81-  90 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20943311; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12287s; TotalTimePerSample = 0.49148ms; SamplesPerSecond = 2034
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[  91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19116065; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12193s; TotalTimePerSample = 0.48770ms; SamplesPerSecond = 2050
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17923315; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12243s; TotalTimePerSample = 0.48974ms; SamplesPerSecond = 2041
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17075513; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12160s; TotalTimePerSample = 0.48638ms; SamplesPerSecond = 2055
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14442432; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12214s; TotalTimePerSample = 0.48854ms; SamplesPerSecond = 2046
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17753857; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12281s; TotalTimePerSample = 0.49123ms; SamplesPerSecond = 2035
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15087914; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12183s; TotalTimePerSample = 0.48733ms; SamplesPerSecond = 2052
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19252978; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12266s; TotalTimePerSample = 0.49063ms; SamplesPerSecond = 2038
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17830664; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12266s; TotalTimePerSample = 0.49063ms; SamplesPerSecond = 2038
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15115429; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12254s; TotalTimePerSample = 0.49016ms; SamplesPerSecond = 2040
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19135889; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12208s; TotalTimePerSample = 0.48830ms; SamplesPerSecond = 2047
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.21491407; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12304s; TotalTimePerSample = 0.49215ms; SamplesPerSecond = 2031
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18682373; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12156s; TotalTimePerSample = 0.48625ms; SamplesPerSecond = 2056
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18483251; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12136s; TotalTimePerSample = 0.48543ms; SamplesPerSecond = 2060
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14684522; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12230s; TotalTimePerSample = 0.48920ms; SamplesPerSecond = 2044
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15322119; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12307s; TotalTimePerSample = 0.49228ms; SamplesPerSecond = 2031
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19882520; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12373s; TotalTimePerSample = 0.49490ms; SamplesPerSecond = 2020
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13683788; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12241s; TotalTimePerSample = 0.48964ms; SamplesPerSecond = 2042
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18621191; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12195s; TotalTimePerSample = 0.48782ms; SamplesPerSecond = 2049
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19408056; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12169s; TotalTimePerSample = 0.48674ms; SamplesPerSecond = 2054
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17298096; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12217s; TotalTimePerSample = 0.48868ms; SamplesPerSecond = 2046
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13265137; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12193s; TotalTimePerSample = 0.48771ms; SamplesPerSecond = 2050
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17627051; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12156s; TotalTimePerSample = 0.48626ms; SamplesPerSecond = 2056
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12734570; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12410s; TotalTimePerSample = 0.49638ms; SamplesPerSecond = 2014
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15108399; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12115s; TotalTimePerSample = 0.48460ms; SamplesPerSecond = 2063
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19729199; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12268s; TotalTimePerSample = 0.49072ms; SamplesPerSecond = 2037
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12857373; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12203s; TotalTimePerSample = 0.48812ms; SamplesPerSecond = 2048
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13867822; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12273s; TotalTimePerSample = 0.49092ms; SamplesPerSecond = 2037
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12786084; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12172s; TotalTimePerSample = 0.48688ms; SamplesPerSecond = 2053
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16643262; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12109s; TotalTimePerSample = 0.48436ms; SamplesPerSecond = 2064
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20440333; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12237s; TotalTimePerSample = 0.48948ms; SamplesPerSecond = 2042
-MPI Rank 3:  Epoch[ 2 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14566259; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12233s; TotalTimePerSample = 0.48931ms; SamplesPerSecond = 2043
-MPI Rank 3: Finished Epoch[ 2 of 10]: [Training Set] TrainLossPerSample = 0.20373113; EvalErrPerSample = 0.082699999; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.929802
-MPI Rank 3: Starting Epoch 3: learning rate per sample = 0.008000  momentum = 0.900001 
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.50774607; EvalErr[0]PerSample = 0.24000000; TotalTime = 0.12567s; TotalTimePerSample = 0.50268ms; SamplesPerSecond = 1989
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.43388910; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12532s; TotalTimePerSample = 0.50129ms; SamplesPerSecond = 1994
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.36674852; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12549s; TotalTimePerSample = 0.50195ms; SamplesPerSecond = 1992
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.33768746; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12523s; TotalTimePerSample = 0.50094ms; SamplesPerSecond = 1996
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.30320932; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12505s; TotalTimePerSample = 0.50018ms; SamplesPerSecond = 1999
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.29576032; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12563s; TotalTimePerSample = 0.50252ms; SamplesPerSecond = 1989
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.24924483; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12490s; TotalTimePerSample = 0.49960ms; SamplesPerSecond = 2001
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.24632409; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12564s; TotalTimePerSample = 0.50257ms; SamplesPerSecond = 1989
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20943152; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12525s; TotalTimePerSample = 0.50102ms; SamplesPerSecond = 1995
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19115992; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12537s; TotalTimePerSample = 0.50149ms; SamplesPerSecond = 1994
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17923227; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12505s; TotalTimePerSample = 0.50018ms; SamplesPerSecond = 1999
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17075420; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12524s; TotalTimePerSample = 0.50098ms; SamplesPerSecond = 1996
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14442369; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12534s; TotalTimePerSample = 0.50138ms; SamplesPerSecond = 1994
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17753818; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12514s; TotalTimePerSample = 0.50056ms; SamplesPerSecond = 1997
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15087853; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12521s; TotalTimePerSample = 0.50083ms; SamplesPerSecond = 1996
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19253021; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12534s; TotalTimePerSample = 0.50136ms; SamplesPerSecond = 1994
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17830684; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12533s; TotalTimePerSample = 0.50130ms; SamplesPerSecond = 1994
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15115428; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12476s; TotalTimePerSample = 0.49904ms; SamplesPerSecond = 2003
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19135968; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12523s; TotalTimePerSample = 0.50093ms; SamplesPerSecond = 1996
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.21491485; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.12494s; TotalTimePerSample = 0.49976ms; SamplesPerSecond = 2000
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18682346; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12558s; TotalTimePerSample = 0.50232ms; SamplesPerSecond = 1990
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18483205; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12477s; TotalTimePerSample = 0.49909ms; SamplesPerSecond = 2003
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14684503; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12463s; TotalTimePerSample = 0.49854ms; SamplesPerSecond = 2005
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15322116; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12531s; TotalTimePerSample = 0.50124ms; SamplesPerSecond = 1995
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19882571; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12527s; TotalTimePerSample = 0.50109ms; SamplesPerSecond = 1995
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13683832; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12494s; TotalTimePerSample = 0.49975ms; SamplesPerSecond = 2000
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18621189; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12491s; TotalTimePerSample = 0.49962ms; SamplesPerSecond = 2001
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19408050; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12603s; TotalTimePerSample = 0.50412ms; SamplesPerSecond = 1983
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17298137; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12517s; TotalTimePerSample = 0.50067ms; SamplesPerSecond = 1997
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13265130; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12562s; TotalTimePerSample = 0.50249ms; SamplesPerSecond = 1990
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17627178; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12516s; TotalTimePerSample = 0.50065ms; SamplesPerSecond = 1997
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12734628; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12563s; TotalTimePerSample = 0.50253ms; SamplesPerSecond = 1989
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15108452; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12524s; TotalTimePerSample = 0.50094ms; SamplesPerSecond = 1996
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19729185; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12567s; TotalTimePerSample = 0.50269ms; SamplesPerSecond = 1989
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12857333; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12512s; TotalTimePerSample = 0.50048ms; SamplesPerSecond = 1998
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13867804; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12546s; TotalTimePerSample = 0.50183ms; SamplesPerSecond = 1992
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12786051; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12557s; TotalTimePerSample = 0.50230ms; SamplesPerSecond = 1990
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16643303; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12504s; TotalTimePerSample = 0.50015ms; SamplesPerSecond = 1999
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20440408; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12589s; TotalTimePerSample = 0.50354ms; SamplesPerSecond = 1985
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14566237; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12509s; TotalTimePerSample = 0.50036ms; SamplesPerSecond = 1998
+MPI Rank 3: Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 0.20373026; EvalErrPerSample = 0.0827; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.013093
+MPI Rank 3: Starting Epoch 3: learning rate per sample = 0.008000  effective momentum = 0.900000 
 MPI Rank 3: starting epoch 2 at record count 20000, and file position 0
 MPI Rank 3: already there from last epoch
 MPI Rank 3: 
 MPI Rank 3: Starting minibatch loop, DataParallelSGD training (MyRank = 3, NumNodes = 4, NumGradientBits = 32).
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[   1-  10 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12590086; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12908s; TotalTimePerSample = 0.51633ms; SamplesPerSecond = 1936
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[  11-  20 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17780226; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12396s; TotalTimePerSample = 0.49584ms; SamplesPerSecond = 2016
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[  21-  30 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14417633; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12283s; TotalTimePerSample = 0.49131ms; SamplesPerSecond = 2035
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[  31-  40 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15796880; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12282s; TotalTimePerSample = 0.49129ms; SamplesPerSecond = 2035
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[  41-  50 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17002991; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12299s; TotalTimePerSample = 0.49194ms; SamplesPerSecond = 2032
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[  51-  60 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18262109; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12230s; TotalTimePerSample = 0.48920ms; SamplesPerSecond = 2044
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[  61-  70 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14643688; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12102s; TotalTimePerSample = 0.48407ms; SamplesPerSecond = 2065
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[  71-  80 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18030518; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12217s; TotalTimePerSample = 0.48868ms; SamplesPerSecond = 2046
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[  81-  90 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15846142; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12213s; TotalTimePerSample = 0.48853ms; SamplesPerSecond = 2046
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[  91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14486536; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12131s; TotalTimePerSample = 0.48526ms; SamplesPerSecond = 2060
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13469091; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12150s; TotalTimePerSample = 0.48601ms; SamplesPerSecond = 2057
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12147s; TotalTimePerSample = 0.48588ms; SamplesPerSecond = 2058
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.11641297; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12266s; TotalTimePerSample = 0.49064ms; SamplesPerSecond = 2038
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16786633; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12217s; TotalTimePerSample = 0.48867ms; SamplesPerSecond = 2046
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12811548; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12209s; TotalTimePerSample = 0.48836ms; SamplesPerSecond = 2047
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17257836; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12251s; TotalTimePerSample = 0.49005ms; SamplesPerSecond = 2040
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17623682; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12008s; TotalTimePerSample = 0.48034ms; SamplesPerSecond = 2081
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14121118; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12156s; TotalTimePerSample = 0.48624ms; SamplesPerSecond = 2056
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19243409; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12071s; TotalTimePerSample = 0.48282ms; SamplesPerSecond = 2071
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20908155; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12149s; TotalTimePerSample = 0.48598ms; SamplesPerSecond = 2057
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18472095; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12149s; TotalTimePerSample = 0.48597ms; SamplesPerSecond = 2057
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18185547; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12187s; TotalTimePerSample = 0.48748ms; SamplesPerSecond = 2051
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14074194; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12174s; TotalTimePerSample = 0.48697ms; SamplesPerSecond = 2053
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14871632; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12167s; TotalTimePerSample = 0.48668ms; SamplesPerSecond = 2054
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20299682; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12216s; TotalTimePerSample = 0.48864ms; SamplesPerSecond = 2046
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12852076; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12055s; TotalTimePerSample = 0.48219ms; SamplesPerSecond = 2073
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18660498; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12199s; TotalTimePerSample = 0.48796ms; SamplesPerSecond = 2049
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19576025; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12239s; TotalTimePerSample = 0.48957ms; SamplesPerSecond = 2042
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16667627; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12266s; TotalTimePerSample = 0.49065ms; SamplesPerSecond = 2038
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12526172; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12058s; TotalTimePerSample = 0.48234ms; SamplesPerSecond = 2073
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17391992; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12146s; TotalTimePerSample = 0.48584ms; SamplesPerSecond = 2058
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12281641; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12141s; TotalTimePerSample = 0.48564ms; SamplesPerSecond = 2059
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14759424; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12210s; TotalTimePerSample = 0.48838ms; SamplesPerSecond = 2047
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19801368; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12210s; TotalTimePerSample = 0.48840ms; SamplesPerSecond = 2047
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12593359; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12242s; TotalTimePerSample = 0.48967ms; SamplesPerSecond = 2042
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13756640; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12193s; TotalTimePerSample = 0.48772ms; SamplesPerSecond = 2050
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12244s; TotalTimePerSample = 0.48974ms; SamplesPerSecond = 2041
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16654395; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12273s; TotalTimePerSample = 0.49091ms; SamplesPerSecond = 2037
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20658936; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12091s; TotalTimePerSample = 0.48362ms; SamplesPerSecond = 2067
-MPI Rank 3:  Epoch[ 3 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14583300; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12209s; TotalTimePerSample = 0.48836ms; SamplesPerSecond = 2047
-MPI Rank 3: Finished Epoch[ 3 of 10]: [Training Set] TrainLossPerSample = 0.15948617; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.918964
-MPI Rank 3: Starting Epoch 4: learning rate per sample = 0.008000  momentum = 0.900001 
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12590085; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12535s; TotalTimePerSample = 0.50138ms; SamplesPerSecond = 1994
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17780230; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12508s; TotalTimePerSample = 0.50032ms; SamplesPerSecond = 1998
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14417637; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12562s; TotalTimePerSample = 0.50248ms; SamplesPerSecond = 1990
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15796896; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12536s; TotalTimePerSample = 0.50144ms; SamplesPerSecond = 1994
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17003000; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12537s; TotalTimePerSample = 0.50150ms; SamplesPerSecond = 1994
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18262114; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12539s; TotalTimePerSample = 0.50157ms; SamplesPerSecond = 1993
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14643695; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12560s; TotalTimePerSample = 0.50240ms; SamplesPerSecond = 1990
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18030528; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12594s; TotalTimePerSample = 0.50375ms; SamplesPerSecond = 1985
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15846150; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12554s; TotalTimePerSample = 0.50217ms; SamplesPerSecond = 1991
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14486534; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12555s; TotalTimePerSample = 0.50218ms; SamplesPerSecond = 1991
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13469094; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12560s; TotalTimePerSample = 0.50238ms; SamplesPerSecond = 1990
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12511s; TotalTimePerSample = 0.50044ms; SamplesPerSecond = 1998
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.11641295; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12529s; TotalTimePerSample = 0.50114ms; SamplesPerSecond = 1995
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16786647; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12511s; TotalTimePerSample = 0.50044ms; SamplesPerSecond = 1998
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12811514; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12520s; TotalTimePerSample = 0.50081ms; SamplesPerSecond = 1996
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17257851; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12589s; TotalTimePerSample = 0.50357ms; SamplesPerSecond = 1985
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17623656; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12548s; TotalTimePerSample = 0.50190ms; SamplesPerSecond = 1992
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14121117; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12516s; TotalTimePerSample = 0.50063ms; SamplesPerSecond = 1997
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19243442; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12554s; TotalTimePerSample = 0.50217ms; SamplesPerSecond = 1991
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20908161; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12522s; TotalTimePerSample = 0.50090ms; SamplesPerSecond = 1996
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18472067; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12552s; TotalTimePerSample = 0.50209ms; SamplesPerSecond = 1991
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18185536; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12523s; TotalTimePerSample = 0.50091ms; SamplesPerSecond = 1996
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14074204; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12472s; TotalTimePerSample = 0.49888ms; SamplesPerSecond = 2004
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14871620; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12523s; TotalTimePerSample = 0.50091ms; SamplesPerSecond = 1996
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20299705; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12557s; TotalTimePerSample = 0.50228ms; SamplesPerSecond = 1990
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12852037; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12523s; TotalTimePerSample = 0.50090ms; SamplesPerSecond = 1996
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18660440; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12537s; TotalTimePerSample = 0.50148ms; SamplesPerSecond = 1994
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19575997; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12483s; TotalTimePerSample = 0.49931ms; SamplesPerSecond = 2002
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16667676; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12610s; TotalTimePerSample = 0.50441ms; SamplesPerSecond = 1982
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12526168; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12498s; TotalTimePerSample = 0.49993ms; SamplesPerSecond = 2000
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17392133; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12559s; TotalTimePerSample = 0.50237ms; SamplesPerSecond = 1990
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12281615; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12513s; TotalTimePerSample = 0.50054ms; SamplesPerSecond = 1997
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14759390; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12501s; TotalTimePerSample = 0.50004ms; SamplesPerSecond = 1999
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19801300; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12531s; TotalTimePerSample = 0.50124ms; SamplesPerSecond = 1995
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12593395; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12506s; TotalTimePerSample = 0.50023ms; SamplesPerSecond = 1999
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13756617; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12535s; TotalTimePerSample = 0.50141ms; SamplesPerSecond = 1994
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12534s; TotalTimePerSample = 0.50137ms; SamplesPerSecond = 1994
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16654368; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12550s; TotalTimePerSample = 0.50199ms; SamplesPerSecond = 1992
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20658950; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12534s; TotalTimePerSample = 0.50137ms; SamplesPerSecond = 1994
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14583322; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12566s; TotalTimePerSample = 0.50263ms; SamplesPerSecond = 1989
+MPI Rank 3: Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 0.15948618; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.016289
+MPI Rank 3: Starting Epoch 4: learning rate per sample = 0.008000  effective momentum = 0.900000 
 MPI Rank 3: starting epoch 3 at record count 30000, and file position 0
 MPI Rank 3: already there from last epoch
 MPI Rank 3: 
 MPI Rank 3: Starting minibatch loop, DataParallelSGD training (MyRank = 3, NumNodes = 4, NumGradientBits = 32).
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[   1-  10 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12371233; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12751s; TotalTimePerSample = 0.51002ms; SamplesPerSecond = 1960
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[  11-  20 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18070515; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12299s; TotalTimePerSample = 0.49196ms; SamplesPerSecond = 2032
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[  21-  30 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14239721; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12164s; TotalTimePerSample = 0.48655ms; SamplesPerSecond = 2055
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[  31-  40 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15630139; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12202s; TotalTimePerSample = 0.48808ms; SamplesPerSecond = 2048
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[  41-  50 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16935523; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12092s; TotalTimePerSample = 0.48368ms; SamplesPerSecond = 2067
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[  51-  60 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18198816; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12084s; TotalTimePerSample = 0.48334ms; SamplesPerSecond = 2068
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[  61-  70 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14475952; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12254s; TotalTimePerSample = 0.49015ms; SamplesPerSecond = 2040
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[  71-  80 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18021594; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12209s; TotalTimePerSample = 0.48837ms; SamplesPerSecond = 2047
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[  81-  90 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.15849304; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12483s; TotalTimePerSample = 0.49934ms; SamplesPerSecond = 2002
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[  91- 100 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14474402; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12218s; TotalTimePerSample = 0.48872ms; SamplesPerSecond = 2046
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 101- 110 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13362928; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12226s; TotalTimePerSample = 0.48902ms; SamplesPerSecond = 2044
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 111- 120 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13708325; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12182s; TotalTimePerSample = 0.48728ms; SamplesPerSecond = 2052
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 121- 130 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.11569763; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12244s; TotalTimePerSample = 0.48977ms; SamplesPerSecond = 2041
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 131- 140 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16892321; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12146s; TotalTimePerSample = 0.48586ms; SamplesPerSecond = 2058
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 141- 150 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12752125; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12127s; TotalTimePerSample = 0.48510ms; SamplesPerSecond = 2061
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 151- 160 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17100880; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12211s; TotalTimePerSample = 0.48843ms; SamplesPerSecond = 2047
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 161- 170 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17660449; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12218s; TotalTimePerSample = 0.48873ms; SamplesPerSecond = 2046
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 171- 180 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14105836; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12168s; TotalTimePerSample = 0.48673ms; SamplesPerSecond = 2054
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 181- 190 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19333544; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12124s; TotalTimePerSample = 0.48496ms; SamplesPerSecond = 2062
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 191- 200 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20859498; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12069s; TotalTimePerSample = 0.48276ms; SamplesPerSecond = 2071
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 201- 210 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18499707; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12378s; TotalTimePerSample = 0.49513ms; SamplesPerSecond = 2019
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 211- 220 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18152441; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12165s; TotalTimePerSample = 0.48661ms; SamplesPerSecond = 2055
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 221- 230 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14037134; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12258s; TotalTimePerSample = 0.49032ms; SamplesPerSecond = 2039
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 231- 240 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14866894; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12390s; TotalTimePerSample = 0.49561ms; SamplesPerSecond = 2017
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 241- 250 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20347705; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12229s; TotalTimePerSample = 0.48917ms; SamplesPerSecond = 2044
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 251- 260 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12815039; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12245s; TotalTimePerSample = 0.48980ms; SamplesPerSecond = 2041
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 261- 270 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.18672803; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12288s; TotalTimePerSample = 0.49152ms; SamplesPerSecond = 2034
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 271- 280 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19552930; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12108s; TotalTimePerSample = 0.48432ms; SamplesPerSecond = 2064
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 281- 290 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16452637; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12195s; TotalTimePerSample = 0.48780ms; SamplesPerSecond = 2050
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 291- 300 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12461865; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12028s; TotalTimePerSample = 0.48110ms; SamplesPerSecond = 2078
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 301- 310 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.17285107; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12184s; TotalTimePerSample = 0.48736ms; SamplesPerSecond = 2051
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 311- 320 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12253613; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12352s; TotalTimePerSample = 0.49408ms; SamplesPerSecond = 2023
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 321- 330 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14723291; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12238s; TotalTimePerSample = 0.48952ms; SamplesPerSecond = 2042
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 331- 340 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.19789551; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12333s; TotalTimePerSample = 0.49332ms; SamplesPerSecond = 2027
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 341- 350 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12575878; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12335s; TotalTimePerSample = 0.49340ms; SamplesPerSecond = 2026
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 351- 360 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.13745947; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12227s; TotalTimePerSample = 0.48906ms; SamplesPerSecond = 2044
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 361- 370 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.12839746; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12152s; TotalTimePerSample = 0.48608ms; SamplesPerSecond = 2057
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 371- 380 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.16647315; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12209s; TotalTimePerSample = 0.48836ms; SamplesPerSecond = 2047
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 381- 390 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.20679444; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12143s; TotalTimePerSample = 0.48573ms; SamplesPerSecond = 2058
-MPI Rank 3:  Epoch[ 4 of 10]-Minibatch[ 391- 400 of -171798692]: SamplesSeen = 250; TrainLossPerSample =  0.14585204; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12219s; TotalTimePerSample = 0.48876ms; SamplesPerSecond = 2046
-MPI Rank 3: Finished Epoch[ 4 of 10]: [Training Set] TrainLossPerSample = 0.15914927; EvalErrPerSample = 0.076700002; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.927711
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12371233; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12546s; TotalTimePerSample = 0.50182ms; SamplesPerSecond = 1992
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18070514; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12545s; TotalTimePerSample = 0.50180ms; SamplesPerSecond = 1992
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14239731; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12565s; TotalTimePerSample = 0.50258ms; SamplesPerSecond = 1989
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15630155; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12556s; TotalTimePerSample = 0.50224ms; SamplesPerSecond = 1991
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16935525; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12543s; TotalTimePerSample = 0.50174ms; SamplesPerSecond = 1993
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18198833; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12530s; TotalTimePerSample = 0.50121ms; SamplesPerSecond = 1995
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14475946; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12540s; TotalTimePerSample = 0.50159ms; SamplesPerSecond = 1993
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18021602; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12464s; TotalTimePerSample = 0.49856ms; SamplesPerSecond = 2005
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15849308; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12530s; TotalTimePerSample = 0.50120ms; SamplesPerSecond = 1995
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14474426; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12568s; TotalTimePerSample = 0.50272ms; SamplesPerSecond = 1989
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13362926; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12452s; TotalTimePerSample = 0.49808ms; SamplesPerSecond = 2007
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13708300; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12525s; TotalTimePerSample = 0.50101ms; SamplesPerSecond = 1995
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.11569776; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12514s; TotalTimePerSample = 0.50058ms; SamplesPerSecond = 1997
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16892330; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12559s; TotalTimePerSample = 0.50234ms; SamplesPerSecond = 1990
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12752163; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12515s; TotalTimePerSample = 0.50059ms; SamplesPerSecond = 1997
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17100866; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12566s; TotalTimePerSample = 0.50264ms; SamplesPerSecond = 1989
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17660425; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12500s; TotalTimePerSample = 0.50002ms; SamplesPerSecond = 1999
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14105804; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12543s; TotalTimePerSample = 0.50172ms; SamplesPerSecond = 1993
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19333553; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12542s; TotalTimePerSample = 0.50168ms; SamplesPerSecond = 1993
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20859525; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12515s; TotalTimePerSample = 0.50059ms; SamplesPerSecond = 1997
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18499677; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12599s; TotalTimePerSample = 0.50396ms; SamplesPerSecond = 1984
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18152438; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12547s; TotalTimePerSample = 0.50188ms; SamplesPerSecond = 1992
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14037158; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12508s; TotalTimePerSample = 0.50033ms; SamplesPerSecond = 1998
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14866862; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12547s; TotalTimePerSample = 0.50189ms; SamplesPerSecond = 1992
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20347748; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12559s; TotalTimePerSample = 0.50236ms; SamplesPerSecond = 1990
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12815013; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12526s; TotalTimePerSample = 0.50104ms; SamplesPerSecond = 1995
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18672810; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12549s; TotalTimePerSample = 0.50197ms; SamplesPerSecond = 1992
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19552989; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12511s; TotalTimePerSample = 0.50045ms; SamplesPerSecond = 1998
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16452642; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12558s; TotalTimePerSample = 0.50232ms; SamplesPerSecond = 1990
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12461825; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12496s; TotalTimePerSample = 0.49984ms; SamplesPerSecond = 2000
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17285251; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12575s; TotalTimePerSample = 0.50300ms; SamplesPerSecond = 1988
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12253620; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12561s; TotalTimePerSample = 0.50243ms; SamplesPerSecond = 1990
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14723333; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12548s; TotalTimePerSample = 0.50190ms; SamplesPerSecond = 1992
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19789537; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12617s; TotalTimePerSample = 0.50467ms; SamplesPerSecond = 1981
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12575877; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12609s; TotalTimePerSample = 0.50438ms; SamplesPerSecond = 1982
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13745928; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12558s; TotalTimePerSample = 0.50231ms; SamplesPerSecond = 1990
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12839652; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12498s; TotalTimePerSample = 0.49992ms; SamplesPerSecond = 2000
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16647280; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12549s; TotalTimePerSample = 0.50196ms; SamplesPerSecond = 1992
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20679434; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12527s; TotalTimePerSample = 0.50107ms; SamplesPerSecond = 1995
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12570s; TotalTimePerSample = 0.50282ms; SamplesPerSecond = 1988
+MPI Rank 3: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.018182
+MPI Rank 3: CNTKCommandTrainEnd: SimpleMultiGPU
 MPI Rank 3: COMPLETED
 MPI Rank 3: ~MPIWrapper
diff --git a/Tests/ParallelTraining/NoQuantization/SinglePrecision/baseline.windows.gpu.txt b/Tests/ParallelTraining/NoQuantization/SinglePrecision/baseline.windows.gpu.txt
new file mode 100644
index 000000000..04e6c6109
--- /dev/null
+++ b/Tests/ParallelTraining/NoQuantization/SinglePrecision/baseline.windows.gpu.txt
@@ -0,0 +1,2449 @@
+=== Running C:\Program Files\Microsoft MPI\Bin\/mpiexec.exe -n 4 E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining/SimpleMultiGPU.config RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining DeviceId=0 precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
+MPIWrapper: initializing MPI
+MPIWrapper: initializing MPI
+MPIWrapper: initializing MPI
+MPIWrapper: initializing MPI
+ping [requestnodes (before change)]: 4 nodes pinging each other
+ping [requestnodes (before change)]: 4 nodes pinging each other
+ping [requestnodes (before change)]: 4 nodes pinging each other
+ping [requestnodes (before change)]: 4 nodes pinging each other
+ping [requestnodes (before change)]: all 4 nodes responded
+ping [requestnodes (before change)]: all 4 nodes responded
+ping [requestnodes (before change)]: all 4 nodes responded
+ping [requestnodes (before change)]: all 4 nodes responded
+requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (1) are in (participating)
+requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (3) are in (participating)
+requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (0) are in (participating)
+requestnodes [MPIWrapper]: using 4 out of 4 MPI nodes (4 requested); we (2) are in (participating)
+ping [requestnodes (after change)]: 4 nodes pinging each other
+ping [requestnodes (after change)]: 4 nodes pinging each other
+ping [requestnodes (after change)]: 4 nodes pinging each other
+ping [requestnodes (after change)]: 4 nodes pinging each other
+ping [requestnodes (after change)]: all 4 nodes responded
+ping [requestnodes (after change)]: all 4 nodes responded
+ping [requestnodes (after change)]: all 4 nodes responded
+ping [requestnodes (after change)]: all 4 nodes responded
+mpihelper: we are cog 0 in a gearbox of 4
+mpihelper: we are cog 2 in a gearbox of 4
+mpihelper: we are cog 3 in a gearbox of 4
+mpihelper: we are cog 1 in a gearbox of 4
+ping [mpihelper]: 4 nodes pinging each other
+ping [mpihelper]: 4 nodes pinging each other
+ping [mpihelper]: 4 nodes pinging each other
+ping [mpihelper]: 4 nodes pinging each other
+ping [mpihelper]: all 4 nodes responded
+ping [mpihelper]: all 4 nodes responded
+ping [mpihelper]: all 4 nodes responded
+ping [mpihelper]: all 4 nodes responded
+MPI Rank 0: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank0
+MPI Rank 0: -------------------------------------------------------------------
+MPI Rank 0: Build info: 
+MPI Rank 0: 
+MPI Rank 0: 		Built time: Oct 24 2015 13:33:25
+MPI Rank 0: 		Last modified date: Thu Oct 22 16:00:27 2015
+MPI Rank 0: 		Built by amitaga on Amitaga-Win-DT3           
+MPI Rank 0: 		Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\
+MPI Rank 0: 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
+MPI Rank 0: -------------------------------------------------------------------
+MPI Rank 0: running on Amitaga-Win-DT3 at 2015/10/24 21:49:38
+MPI Rank 0: command line: 
+MPI Rank 0: E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining/SimpleMultiGPU.config RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining DeviceId=0 precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr 
+MPI Rank 0: 
+MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
+MPI Rank 0: deviceId=$DeviceId$
+MPI Rank 0: command=SimpleMultiGPU
+MPI Rank 0: precision=float
+MPI Rank 0: parallelTrain=true
+MPI Rank 0: SimpleMultiGPU=[
+MPI Rank 0:     action=train
+MPI Rank 0:     modelPath=$RunDir$/models/Simple.dnn
+MPI Rank 0:     deviceId=$DeviceId$
+MPI Rank 0:     traceLevel=1
+MPI Rank 0:     SimpleNetworkBuilder=[
+MPI Rank 0:         layerSizes=2:50*2:2
+MPI Rank 0:         trainingCriterion=CrossEntropyWithSoftmax
+MPI Rank 0:         evalCriterion=ErrorPrediction
+MPI Rank 0:         layerTypes=Sigmoid
+MPI Rank 0:         initValueScale=1.0
+MPI Rank 0:         applyMeanVarNorm=true
+MPI Rank 0:         uniformInit=true
+MPI Rank 0:         needPrior=true
+MPI Rank 0:     ]
+MPI Rank 0:     SGD=[
+MPI Rank 0:         epochSize=0 
+MPI Rank 0:         minibatchSize=25
+MPI Rank 0:         learningRatesPerMB=0.5:0.2*20:0.1
+MPI Rank 0:         momentumPerMB=0.9
+MPI Rank 0:         dropoutRate=0.0
+MPI Rank 0:         maxEpochs=4
+MPI Rank 0:         ParallelTrain=[
+MPI Rank 0:             parallelizationMethod=DataParallelSGD
+MPI Rank 0:             DataParallelSGD=[
+MPI Rank 0:               gradientBits=1
+MPI Rank 0:             ]
+MPI Rank 0:         ]
+MPI Rank 0:     ]
+MPI Rank 0:     reader=[
+MPI Rank 0:       readerType=UCIFastReader
+MPI Rank 0:       file=$DataDir$/SimpleDataTrain.txt
+MPI Rank 0:       miniBatchMode=Partial
+MPI Rank 0:       randomize=None
+MPI Rank 0:       verbosity=1   
+MPI Rank 0:       features=[
+MPI Rank 0: dim=2      
+MPI Rank 0: start=0    
+MPI Rank 0:       ]
+MPI Rank 0:       labels=[
+MPI Rank 0: start=2      
+MPI Rank 0: dim=1        
+MPI Rank 0: labelDim=2   
+MPI Rank 0:         labelMappingFile=$DataDir$/SimpleMapping.txt
+MPI Rank 0:       ]
+MPI Rank 0:     ]
+MPI Rank 0: ]
+MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 0: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data
+MPI Rank 0: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining
+MPI Rank 0: DeviceId=0
+MPI Rank 0: precision=float
+MPI Rank 0: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
+MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
+MPI Rank 0: 
+MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
+MPI Rank 0: 
+MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+MPI Rank 0: deviceId=0
+MPI Rank 0: command=SimpleMultiGPU
+MPI Rank 0: precision=float
+MPI Rank 0: parallelTrain=true
+MPI Rank 0: SimpleMultiGPU=[
+MPI Rank 0:     action=train
+MPI Rank 0:     modelPath=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 0:     deviceId=0
+MPI Rank 0:     traceLevel=1
+MPI Rank 0:     SimpleNetworkBuilder=[
+MPI Rank 0:         layerSizes=2:50*2:2
+MPI Rank 0:         trainingCriterion=CrossEntropyWithSoftmax
+MPI Rank 0:         evalCriterion=ErrorPrediction
+MPI Rank 0:         layerTypes=Sigmoid
+MPI Rank 0:         initValueScale=1.0
+MPI Rank 0:         applyMeanVarNorm=true
+MPI Rank 0:         uniformInit=true
+MPI Rank 0:         needPrior=true
+MPI Rank 0:     ]
+MPI Rank 0:     SGD=[
+MPI Rank 0:         epochSize=0 
+MPI Rank 0:         minibatchSize=25
+MPI Rank 0:         learningRatesPerMB=0.5:0.2*20:0.1
+MPI Rank 0:         momentumPerMB=0.9
+MPI Rank 0:         dropoutRate=0.0
+MPI Rank 0:         maxEpochs=4
+MPI Rank 0:         ParallelTrain=[
+MPI Rank 0:             parallelizationMethod=DataParallelSGD
+MPI Rank 0:             DataParallelSGD=[
+MPI Rank 0:               gradientBits=1
+MPI Rank 0:             ]
+MPI Rank 0:         ]
+MPI Rank 0:     ]
+MPI Rank 0:     reader=[
+MPI Rank 0:       readerType=UCIFastReader
+MPI Rank 0:       file=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 0:       miniBatchMode=Partial
+MPI Rank 0:       randomize=None
+MPI Rank 0:       verbosity=1   
+MPI Rank 0:       features=[
+MPI Rank 0: dim=2      
+MPI Rank 0: start=0    
+MPI Rank 0:       ]
+MPI Rank 0:       labels=[
+MPI Rank 0: start=2      
+MPI Rank 0: dim=1        
+MPI Rank 0: labelDim=2   
+MPI Rank 0:         labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleMapping.txt
+MPI Rank 0:       ]
+MPI Rank 0:     ]
+MPI Rank 0: ]
+MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 0: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data
+MPI Rank 0: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining
+MPI Rank 0: DeviceId=0
+MPI Rank 0: precision=float
+MPI Rank 0: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
+MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
+MPI Rank 0: 
+MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+MPI Rank 0: 
+MPI Rank 0: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+MPI Rank 0: configparameters: SimpleMultiGPU.config:command=SimpleMultiGPU
+MPI Rank 0: configparameters: SimpleMultiGPU.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining
+MPI Rank 0: configparameters: SimpleMultiGPU.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data
+MPI Rank 0: configparameters: SimpleMultiGPU.config:deviceId=0
+MPI Rank 0: configparameters: SimpleMultiGPU.config:parallelTrain=true
+MPI Rank 0: configparameters: SimpleMultiGPU.config:precision=float
+MPI Rank 0: configparameters: SimpleMultiGPU.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 0: configparameters: SimpleMultiGPU.config:SimpleMultiGPU=[
+MPI Rank 0:     action=train
+MPI Rank 0:     modelPath=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 0:     deviceId=0
+MPI Rank 0:     traceLevel=1
+MPI Rank 0:     SimpleNetworkBuilder=[
+MPI Rank 0:         layerSizes=2:50*2:2
+MPI Rank 0:         trainingCriterion=CrossEntropyWithSoftmax
+MPI Rank 0:         evalCriterion=ErrorPrediction
+MPI Rank 0:         layerTypes=Sigmoid
+MPI Rank 0:         initValueScale=1.0
+MPI Rank 0:         applyMeanVarNorm=true
+MPI Rank 0:         uniformInit=true
+MPI Rank 0:         needPrior=true
+MPI Rank 0:     ]
+MPI Rank 0:     SGD=[
+MPI Rank 0:         epochSize=0 
+MPI Rank 0:         minibatchSize=25
+MPI Rank 0:         learningRatesPerMB=0.5:0.2*20:0.1
+MPI Rank 0:         momentumPerMB=0.9
+MPI Rank 0:         dropoutRate=0.0
+MPI Rank 0:         maxEpochs=4
+MPI Rank 0:         ParallelTrain=[
+MPI Rank 0:             parallelizationMethod=DataParallelSGD
+MPI Rank 0:             DataParallelSGD=[
+MPI Rank 0:               gradientBits=1
+MPI Rank 0:             ]
+MPI Rank 0:         ]
+MPI Rank 0:     ]
+MPI Rank 0:     reader=[
+MPI Rank 0:       readerType=UCIFastReader
+MPI Rank 0:       file=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 0:       miniBatchMode=Partial
+MPI Rank 0:       randomize=None
+MPI Rank 0:       verbosity=1   
+MPI Rank 0:       features=[
+MPI Rank 0: dim=2      
+MPI Rank 0: start=0    
+MPI Rank 0:       ]
+MPI Rank 0:       labels=[
+MPI Rank 0: start=2      
+MPI Rank 0: dim=1        
+MPI Rank 0: labelDim=2   
+MPI Rank 0:         labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleMapping.txt
+MPI Rank 0:       ]
+MPI Rank 0:     ]
+MPI Rank 0: ] [SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
+MPI Rank 0: 
+MPI Rank 0: configparameters: SimpleMultiGPU.config:stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
+MPI Rank 0: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+MPI Rank 0: command: SimpleMultiGPU 
+MPI Rank 0: precision = float
+MPI Rank 0: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 0: CNTKCommandTrainInfo: SimpleMultiGPU : 4
+MPI Rank 0: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 4
+MPI Rank 0: CNTKCommandTrainBegin: SimpleMultiGPU
+MPI Rank 0: SimpleNetworkBuilder Using GPU 0
+MPI Rank 0: reading uci file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 0: SetUniformRandomValue (GPU): creating curand object with seed 1
+MPI Rank 0: GetTrainCriterionNodes  ...
+MPI Rank 0: GetEvalCriterionNodes  ...
+MPI Rank 0: 
+MPI Rank 0: 
+MPI Rank 0: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 0: 
+MPI Rank 0: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 0: 
+MPI Rank 0: Validating for node CrossEntropyWithSoftmax, final verification.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 0: 
+MPI Rank 0: 9 out of 20 nodes do not share the minibatch layout with the input data.
+MPI Rank 0: 
+MPI Rank 0: 
+MPI Rank 0: Precomputing --> 3 PreCompute nodes found.
+MPI Rank 0: 
+MPI Rank 0: 	NodeName: InvStdOfFeatures
+MPI Rank 0: 	NodeName: MeanOfFeatures
+MPI Rank 0: 	NodeName: Prior
+MPI Rank 0: starting at epoch 0 counting lines to determine record count
+MPI Rank 0: 
+MPI Rank 0:  10000 records found
+MPI Rank 0: starting epoch 0 at record count 0, and file position 0
+MPI Rank 0: already there from last epoch
+MPI Rank 0: 
+MPI Rank 0: 
+MPI Rank 0: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1.
+MPI Rank 0: 
+MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: 
+MPI Rank 0: Validating for node InvStdOfFeatures, final verification.
+MPI Rank 0: 
+MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: 
+MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data.
+MPI Rank 0: 
+MPI Rank 0: 
+MPI Rank 0: 
+MPI Rank 0: Validating for node MeanOfFeatures. 2 nodes to process in pass 1.
+MPI Rank 0: 
+MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: 
+MPI Rank 0: Validating for node MeanOfFeatures, final verification.
+MPI Rank 0: 
+MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: 
+MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data.
+MPI Rank 0: 
+MPI Rank 0: 
+MPI Rank 0: 
+MPI Rank 0: Validating for node Prior. 2 nodes to process in pass 1.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: 
+MPI Rank 0: Validating for node Prior. 1 nodes to process in pass 2.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: 
+MPI Rank 0: Validating for node Prior, final verification.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 0: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 0: 
+MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data.
+MPI Rank 0: 
+MPI Rank 0: 
+MPI Rank 0: Precomputing --> Completed.
+MPI Rank 0: 
+MPI Rank 0: Set Max Temp Mem Size For Convolution Nodes to 0 samples.
+MPI Rank 0: Starting Epoch 1: learning rate per sample = 0.020000  effective momentum = 0.900000 
+MPI Rank 0: starting epoch 0 at record count 0, and file position 0
+MPI Rank 0: already there from last epoch
+MPI Rank 0: 
+MPI Rank 0: 
+MPI Rank 0: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 0: 
+MPI Rank 0: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 0: 
+MPI Rank 0: Validating for node EvalErrorPrediction, final verification.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 0: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 0: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 0: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 0: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 0: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 0: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 0: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 0: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 0: 
+MPI Rank 0: 9 out of 20 nodes do not share the minibatch layout with the input data.
+MPI Rank 0: 
+MPI Rank 0: 
+MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 4, NumGradientBits = 32).
+MPI Rank 0: DecimateMinibatchSequences: WARNING: Number of parallel utterances 25 not a multiple of number of GPUs 4, GPU usage will be suboptimal.
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[   1-  10 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70007977; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.21274s; TotalTimePerSample = 0.85096ms; SamplesPerSecond = 1175
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[  11-  20 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71514542; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.17175s; TotalTimePerSample = 0.68700ms; SamplesPerSecond = 1455
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[  21-  30 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72945595; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.16870s; TotalTimePerSample = 0.67482ms; SamplesPerSecond = 1481
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[  31-  40 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70079058; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.16118s; TotalTimePerSample = 0.64471ms; SamplesPerSecond = 1551
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[  41-  50 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70605615; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.16159s; TotalTimePerSample = 0.64636ms; SamplesPerSecond = 1547
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[  51-  60 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71572398; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.15234s; TotalTimePerSample = 0.60934ms; SamplesPerSecond = 1641
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[  61-  70 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72149851; EvalErr[0]PerSample = 0.48000000; TotalTime = 0.14166s; TotalTimePerSample = 0.56666ms; SamplesPerSecond = 1764
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[  71-  80 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.79845604; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.14173s; TotalTimePerSample = 0.56692ms; SamplesPerSecond = 1763
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[  81-  90 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69665186; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.13864s; TotalTimePerSample = 0.55454ms; SamplesPerSecond = 1803
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[  91- 100 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70723326; EvalErr[0]PerSample = 0.49200000; TotalTime = 0.12738s; TotalTimePerSample = 0.50953ms; SamplesPerSecond = 1962
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 101- 110 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71420345; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.13068s; TotalTimePerSample = 0.52273ms; SamplesPerSecond = 1913
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 111- 120 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69535258; EvalErr[0]PerSample = 0.43600000; TotalTime = 0.12783s; TotalTimePerSample = 0.51133ms; SamplesPerSecond = 1955
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 121- 130 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70078531; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12405s; TotalTimePerSample = 0.49618ms; SamplesPerSecond = 2015
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 131- 140 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71857916; EvalErr[0]PerSample = 0.54800000; TotalTime = 0.11956s; TotalTimePerSample = 0.47822ms; SamplesPerSecond = 2091
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 141- 150 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72088358; EvalErr[0]PerSample = 0.48800000; TotalTime = 0.11897s; TotalTimePerSample = 0.47589ms; SamplesPerSecond = 2101
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 151- 160 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71798840; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.11620s; TotalTimePerSample = 0.46478ms; SamplesPerSecond = 2151
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 161- 170 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.74162164; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.11800s; TotalTimePerSample = 0.47198ms; SamplesPerSecond = 2118
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 171- 180 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71835128; EvalErr[0]PerSample = 0.51600000; TotalTime = 0.11839s; TotalTimePerSample = 0.47358ms; SamplesPerSecond = 2111
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 181- 190 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71529463; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12527s; TotalTimePerSample = 0.50107ms; SamplesPerSecond = 1995
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 191- 200 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71727657; EvalErr[0]PerSample = 0.53200000; TotalTime = 0.11979s; TotalTimePerSample = 0.47917ms; SamplesPerSecond = 2086
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 201- 210 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71745517; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.11901s; TotalTimePerSample = 0.47605ms; SamplesPerSecond = 2100
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 211- 220 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72088398; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.11482s; TotalTimePerSample = 0.45926ms; SamplesPerSecond = 2177
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 221- 230 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72006809; EvalErr[0]PerSample = 0.50800000; TotalTime = 0.12406s; TotalTimePerSample = 0.49625ms; SamplesPerSecond = 2015
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 231- 240 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71275468; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12120s; TotalTimePerSample = 0.48478ms; SamplesPerSecond = 2062
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 241- 250 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69644781; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.11779s; TotalTimePerSample = 0.47117ms; SamplesPerSecond = 2122
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 251- 260 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70129698; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12147s; TotalTimePerSample = 0.48589ms; SamplesPerSecond = 2058
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 261- 270 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70768095; EvalErr[0]PerSample = 0.54400000; TotalTime = 0.12140s; TotalTimePerSample = 0.48561ms; SamplesPerSecond = 2059
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 271- 280 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69744379; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.11737s; TotalTimePerSample = 0.46946ms; SamplesPerSecond = 2130
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 281- 290 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69266187; EvalErr[0]PerSample = 0.44800000; TotalTime = 0.11798s; TotalTimePerSample = 0.47194ms; SamplesPerSecond = 2118
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 291- 300 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69347266; EvalErr[0]PerSample = 0.49600000; TotalTime = 0.12121s; TotalTimePerSample = 0.48485ms; SamplesPerSecond = 2062
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 301- 310 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69257409; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12546s; TotalTimePerSample = 0.50186ms; SamplesPerSecond = 1992
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 311- 320 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.68625741; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12595s; TotalTimePerSample = 0.50380ms; SamplesPerSecond = 1984
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 321- 330 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69064011; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12351s; TotalTimePerSample = 0.49405ms; SamplesPerSecond = 2024
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 331- 340 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70192154; EvalErr[0]PerSample = 0.46000000; TotalTime = 0.12384s; TotalTimePerSample = 0.49536ms; SamplesPerSecond = 2018
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 341- 350 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69058912; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12360s; TotalTimePerSample = 0.49439ms; SamplesPerSecond = 2022
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 351- 360 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.67041492; EvalErr[0]PerSample = 0.39200000; TotalTime = 0.12239s; TotalTimePerSample = 0.48957ms; SamplesPerSecond = 2042
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 361- 370 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.65913973; EvalErr[0]PerSample = 0.35600000; TotalTime = 0.12837s; TotalTimePerSample = 0.51346ms; SamplesPerSecond = 1947
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 371- 380 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.63919877; EvalErr[0]PerSample = 0.36400000; TotalTime = 0.14480s; TotalTimePerSample = 0.57919ms; SamplesPerSecond = 1726
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 381- 390 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.61293883; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.14498s; TotalTimePerSample = 0.57991ms; SamplesPerSecond = 1724
+MPI Rank 0:  Epoch[ 1 of 4]-Minibatch[ 391- 400 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.55255352; EvalErr[0]PerSample = 0.18800000; TotalTime = 0.14514s; TotalTimePerSample = 0.58057ms; SamplesPerSecond = 1722
+MPI Rank 0: Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.4735; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.368447
+MPI Rank 0: Starting Epoch 2: learning rate per sample = 0.008000  effective momentum = 0.900000 
+MPI Rank 0: starting epoch 1 at record count 10000, and file position 0
+MPI Rank 0: already there from last epoch
+MPI Rank 0: 
+MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 4, NumGradientBits = 32).
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.50774625; EvalErr[0]PerSample = 0.24000000; TotalTime = 0.13805s; TotalTimePerSample = 0.55221ms; SamplesPerSecond = 1810
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.43388927; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.13897s; TotalTimePerSample = 0.55587ms; SamplesPerSecond = 1798
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.36674870; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.14035s; TotalTimePerSample = 0.56139ms; SamplesPerSecond = 1781
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.33768765; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.13735s; TotalTimePerSample = 0.54939ms; SamplesPerSecond = 1820
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.30320946; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12889s; TotalTimePerSample = 0.51557ms; SamplesPerSecond = 1939
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.29576043; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.14907s; TotalTimePerSample = 0.59630ms; SamplesPerSecond = 1677
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.24924491; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14579s; TotalTimePerSample = 0.58317ms; SamplesPerSecond = 1714
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.24632415; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.14396s; TotalTimePerSample = 0.57584ms; SamplesPerSecond = 1736
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20943158; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.13623s; TotalTimePerSample = 0.54492ms; SamplesPerSecond = 1835
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19115996; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.13589s; TotalTimePerSample = 0.54355ms; SamplesPerSecond = 1839
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17923231; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.13511s; TotalTimePerSample = 0.54042ms; SamplesPerSecond = 1850
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17075422; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.13630s; TotalTimePerSample = 0.54521ms; SamplesPerSecond = 1834
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14442371; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.13115s; TotalTimePerSample = 0.52461ms; SamplesPerSecond = 1906
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17753819; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.13713s; TotalTimePerSample = 0.54852ms; SamplesPerSecond = 1823
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15087855; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.13776s; TotalTimePerSample = 0.55104ms; SamplesPerSecond = 1814
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19253023; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.13519s; TotalTimePerSample = 0.54077ms; SamplesPerSecond = 1849
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17830684; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.14500s; TotalTimePerSample = 0.58000ms; SamplesPerSecond = 1724
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15115428; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.14318s; TotalTimePerSample = 0.57271ms; SamplesPerSecond = 1746
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19135968; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.14350s; TotalTimePerSample = 0.57402ms; SamplesPerSecond = 1742
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.21491485; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.14562s; TotalTimePerSample = 0.58247ms; SamplesPerSecond = 1716
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18682346; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.13541s; TotalTimePerSample = 0.54166ms; SamplesPerSecond = 1846
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18483205; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14328s; TotalTimePerSample = 0.57310ms; SamplesPerSecond = 1744
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14684504; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.14565s; TotalTimePerSample = 0.58259ms; SamplesPerSecond = 1716
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15322115; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14666s; TotalTimePerSample = 0.58663ms; SamplesPerSecond = 1704
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19882571; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.14043s; TotalTimePerSample = 0.56172ms; SamplesPerSecond = 1780
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13683833; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.14034s; TotalTimePerSample = 0.56138ms; SamplesPerSecond = 1781
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18621188; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.13422s; TotalTimePerSample = 0.53688ms; SamplesPerSecond = 1862
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19408048; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.13252s; TotalTimePerSample = 0.53009ms; SamplesPerSecond = 1886
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17298137; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.14529s; TotalTimePerSample = 0.58114ms; SamplesPerSecond = 1720
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13265130; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.13280s; TotalTimePerSample = 0.53118ms; SamplesPerSecond = 1882
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17627178; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.13158s; TotalTimePerSample = 0.52633ms; SamplesPerSecond = 1899
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12734628; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12716s; TotalTimePerSample = 0.50865ms; SamplesPerSecond = 1965
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15108451; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12641s; TotalTimePerSample = 0.50564ms; SamplesPerSecond = 1977
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19729184; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12382s; TotalTimePerSample = 0.49529ms; SamplesPerSecond = 2019
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12857332; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12221s; TotalTimePerSample = 0.48882ms; SamplesPerSecond = 2045
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13867804; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12107s; TotalTimePerSample = 0.48428ms; SamplesPerSecond = 2064
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12786050; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12283s; TotalTimePerSample = 0.49134ms; SamplesPerSecond = 2035
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16643303; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12578s; TotalTimePerSample = 0.50312ms; SamplesPerSecond = 1987
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20440409; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12240s; TotalTimePerSample = 0.48960ms; SamplesPerSecond = 2042
+MPI Rank 0:  Epoch[ 2 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14566238; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12173s; TotalTimePerSample = 0.48692ms; SamplesPerSecond = 2053
+MPI Rank 0: Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 0.20373029; EvalErrPerSample = 0.0827; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.454988
+MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.008000  effective momentum = 0.900000 
+MPI Rank 0: starting epoch 2 at record count 20000, and file position 0
+MPI Rank 0: already there from last epoch
+MPI Rank 0: 
+MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 4, NumGradientBits = 32).
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12590085; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.11986s; TotalTimePerSample = 0.47944ms; SamplesPerSecond = 2085
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17780229; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12511s; TotalTimePerSample = 0.50043ms; SamplesPerSecond = 1998
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14417637; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12349s; TotalTimePerSample = 0.49395ms; SamplesPerSecond = 2024
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15796895; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12131s; TotalTimePerSample = 0.48522ms; SamplesPerSecond = 2060
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17002999; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12305s; TotalTimePerSample = 0.49220ms; SamplesPerSecond = 2031
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18262114; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.11573s; TotalTimePerSample = 0.46292ms; SamplesPerSecond = 2160
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14643694; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.11568s; TotalTimePerSample = 0.46271ms; SamplesPerSecond = 2161
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18030528; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12346s; TotalTimePerSample = 0.49385ms; SamplesPerSecond = 2024
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15846150; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12240s; TotalTimePerSample = 0.48958ms; SamplesPerSecond = 2042
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14486534; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12005s; TotalTimePerSample = 0.48020ms; SamplesPerSecond = 2082
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13469093; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.11867s; TotalTimePerSample = 0.47468ms; SamplesPerSecond = 2106
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12647s; TotalTimePerSample = 0.50587ms; SamplesPerSecond = 1976
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.11641295; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12012s; TotalTimePerSample = 0.48047ms; SamplesPerSecond = 2081
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16786647; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.11872s; TotalTimePerSample = 0.47488ms; SamplesPerSecond = 2105
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12811514; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12433s; TotalTimePerSample = 0.49730ms; SamplesPerSecond = 2010
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17257851; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.11777s; TotalTimePerSample = 0.47109ms; SamplesPerSecond = 2122
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17623656; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12099s; TotalTimePerSample = 0.48397ms; SamplesPerSecond = 2066
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14121117; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12178s; TotalTimePerSample = 0.48710ms; SamplesPerSecond = 2052
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19243443; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12140s; TotalTimePerSample = 0.48558ms; SamplesPerSecond = 2059
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20908161; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12905s; TotalTimePerSample = 0.51618ms; SamplesPerSecond = 1937
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18472067; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12856s; TotalTimePerSample = 0.51424ms; SamplesPerSecond = 1944
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18185535; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12184s; TotalTimePerSample = 0.48735ms; SamplesPerSecond = 2051
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14074205; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.13076s; TotalTimePerSample = 0.52305ms; SamplesPerSecond = 1911
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14871620; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.11964s; TotalTimePerSample = 0.47857ms; SamplesPerSecond = 2089
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20299704; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12301s; TotalTimePerSample = 0.49202ms; SamplesPerSecond = 2032
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12852038; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12072s; TotalTimePerSample = 0.48289ms; SamplesPerSecond = 2070
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18660439; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12280s; TotalTimePerSample = 0.49120ms; SamplesPerSecond = 2035
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19575997; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12056s; TotalTimePerSample = 0.48223ms; SamplesPerSecond = 2073
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16667676; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12549s; TotalTimePerSample = 0.50197ms; SamplesPerSecond = 1992
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12526169; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12500s; TotalTimePerSample = 0.50000ms; SamplesPerSecond = 2000
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17392131; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12278s; TotalTimePerSample = 0.49111ms; SamplesPerSecond = 2036
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12281615; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12280s; TotalTimePerSample = 0.49121ms; SamplesPerSecond = 2035
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14759390; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.11641s; TotalTimePerSample = 0.46564ms; SamplesPerSecond = 2147
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19801300; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12504s; TotalTimePerSample = 0.50017ms; SamplesPerSecond = 1999
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12593395; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12272s; TotalTimePerSample = 0.49090ms; SamplesPerSecond = 2037
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13756617; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12083s; TotalTimePerSample = 0.48331ms; SamplesPerSecond = 2069
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12856s; TotalTimePerSample = 0.51424ms; SamplesPerSecond = 1944
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16654369; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.13337s; TotalTimePerSample = 0.53348ms; SamplesPerSecond = 1874
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20658951; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12150s; TotalTimePerSample = 0.48602ms; SamplesPerSecond = 2057
+MPI Rank 0:  Epoch[ 3 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14583322; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.11497s; TotalTimePerSample = 0.45986ms; SamplesPerSecond = 2174
+MPI Rank 0: Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 0.15948618; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.926197
+MPI Rank 0: Starting Epoch 4: learning rate per sample = 0.008000  effective momentum = 0.900000 
+MPI Rank 0: starting epoch 3 at record count 30000, and file position 0
+MPI Rank 0: already there from last epoch
+MPI Rank 0: 
+MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 4, NumGradientBits = 32).
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12371232; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12039s; TotalTimePerSample = 0.48155ms; SamplesPerSecond = 2076
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18070515; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12069s; TotalTimePerSample = 0.48277ms; SamplesPerSecond = 2071
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14239730; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12713s; TotalTimePerSample = 0.50853ms; SamplesPerSecond = 1966
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15630155; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.11918s; TotalTimePerSample = 0.47671ms; SamplesPerSecond = 2097
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16935525; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.11887s; TotalTimePerSample = 0.47550ms; SamplesPerSecond = 2103
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18198833; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12215s; TotalTimePerSample = 0.48859ms; SamplesPerSecond = 2046
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14475946; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12015s; TotalTimePerSample = 0.48061ms; SamplesPerSecond = 2080
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18021601; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.11829s; TotalTimePerSample = 0.47316ms; SamplesPerSecond = 2113
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15849308; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.11851s; TotalTimePerSample = 0.47403ms; SamplesPerSecond = 2109
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14474425; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12129s; TotalTimePerSample = 0.48518ms; SamplesPerSecond = 2061
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13362926; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12019s; TotalTimePerSample = 0.48075ms; SamplesPerSecond = 2080
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13708299; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12225s; TotalTimePerSample = 0.48900ms; SamplesPerSecond = 2045
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.11569777; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12067s; TotalTimePerSample = 0.48266ms; SamplesPerSecond = 2071
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16892331; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.11980s; TotalTimePerSample = 0.47918ms; SamplesPerSecond = 2086
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12752162; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12049s; TotalTimePerSample = 0.48195ms; SamplesPerSecond = 2074
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17100866; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12280s; TotalTimePerSample = 0.49119ms; SamplesPerSecond = 2035
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17660425; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.11920s; TotalTimePerSample = 0.47680ms; SamplesPerSecond = 2097
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14105803; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12263s; TotalTimePerSample = 0.49050ms; SamplesPerSecond = 2038
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19333552; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12588s; TotalTimePerSample = 0.50350ms; SamplesPerSecond = 1986
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20859524; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.14396s; TotalTimePerSample = 0.57582ms; SamplesPerSecond = 1736
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18499677; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12856s; TotalTimePerSample = 0.51423ms; SamplesPerSecond = 1944
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18152438; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12753s; TotalTimePerSample = 0.51010ms; SamplesPerSecond = 1960
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14037157; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.11991s; TotalTimePerSample = 0.47965ms; SamplesPerSecond = 2084
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14866862; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12057s; TotalTimePerSample = 0.48229ms; SamplesPerSecond = 2073
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20347747; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.11798s; TotalTimePerSample = 0.47193ms; SamplesPerSecond = 2118
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12815012; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12094s; TotalTimePerSample = 0.48378ms; SamplesPerSecond = 2067
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18672810; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12091s; TotalTimePerSample = 0.48365ms; SamplesPerSecond = 2067
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19552990; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12224s; TotalTimePerSample = 0.48896ms; SamplesPerSecond = 2045
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16452642; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12373s; TotalTimePerSample = 0.49491ms; SamplesPerSecond = 2020
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12461825; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12109s; TotalTimePerSample = 0.48435ms; SamplesPerSecond = 2064
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17285251; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12231s; TotalTimePerSample = 0.48923ms; SamplesPerSecond = 2044
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12253620; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12255s; TotalTimePerSample = 0.49020ms; SamplesPerSecond = 2039
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14723334; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12410s; TotalTimePerSample = 0.49640ms; SamplesPerSecond = 2014
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19789537; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12264s; TotalTimePerSample = 0.49054ms; SamplesPerSecond = 2038
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12575877; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12440s; TotalTimePerSample = 0.49760ms; SamplesPerSecond = 2009
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13745928; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12623s; TotalTimePerSample = 0.50490ms; SamplesPerSecond = 1980
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12839652; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12027s; TotalTimePerSample = 0.48109ms; SamplesPerSecond = 2078
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16647280; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12139s; TotalTimePerSample = 0.48556ms; SamplesPerSecond = 2059
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20679434; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12487s; TotalTimePerSample = 0.49948ms; SamplesPerSecond = 2002
+MPI Rank 0:  Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12520s; TotalTimePerSample = 0.50081ms; SamplesPerSecond = 1996
+MPI Rank 0: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.931563
+MPI Rank 0: CNTKCommandTrainEnd: SimpleMultiGPU
+MPI Rank 0: COMPLETED
+MPI Rank 0: ~MPIWrapper
+MPI Rank 1: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank1
+MPI Rank 1: -------------------------------------------------------------------
+MPI Rank 1: Build info: 
+MPI Rank 1: 
+MPI Rank 1: 		Built time: Oct 24 2015 13:33:25
+MPI Rank 1: 		Last modified date: Thu Oct 22 16:00:27 2015
+MPI Rank 1: 		Built by amitaga on Amitaga-Win-DT3           
+MPI Rank 1: 		Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\
+MPI Rank 1: 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
+MPI Rank 1: -------------------------------------------------------------------
+MPI Rank 1: running on Amitaga-Win-DT3 at 2015/10/24 21:49:38
+MPI Rank 1: command line: 
+MPI Rank 1: E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining/SimpleMultiGPU.config RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining DeviceId=0 precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr 
+MPI Rank 1: 
+MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
+MPI Rank 1: deviceId=$DeviceId$
+MPI Rank 1: command=SimpleMultiGPU
+MPI Rank 1: precision=float
+MPI Rank 1: parallelTrain=true
+MPI Rank 1: SimpleMultiGPU=[
+MPI Rank 1:     action=train
+MPI Rank 1:     modelPath=$RunDir$/models/Simple.dnn
+MPI Rank 1:     deviceId=$DeviceId$
+MPI Rank 1:     traceLevel=1
+MPI Rank 1:     SimpleNetworkBuilder=[
+MPI Rank 1:         layerSizes=2:50*2:2
+MPI Rank 1:         trainingCriterion=CrossEntropyWithSoftmax
+MPI Rank 1:         evalCriterion=ErrorPrediction
+MPI Rank 1:         layerTypes=Sigmoid
+MPI Rank 1:         initValueScale=1.0
+MPI Rank 1:         applyMeanVarNorm=true
+MPI Rank 1:         uniformInit=true
+MPI Rank 1:         needPrior=true
+MPI Rank 1:     ]
+MPI Rank 1:     SGD=[
+MPI Rank 1:         epochSize=0 
+MPI Rank 1:         minibatchSize=25
+MPI Rank 1:         learningRatesPerMB=0.5:0.2*20:0.1
+MPI Rank 1:         momentumPerMB=0.9
+MPI Rank 1:         dropoutRate=0.0
+MPI Rank 1:         maxEpochs=4
+MPI Rank 1:         ParallelTrain=[
+MPI Rank 1:             parallelizationMethod=DataParallelSGD
+MPI Rank 1:             DataParallelSGD=[
+MPI Rank 1:               gradientBits=1
+MPI Rank 1:             ]
+MPI Rank 1:         ]
+MPI Rank 1:     ]
+MPI Rank 1:     reader=[
+MPI Rank 1:       readerType=UCIFastReader
+MPI Rank 1:       file=$DataDir$/SimpleDataTrain.txt
+MPI Rank 1:       miniBatchMode=Partial
+MPI Rank 1:       randomize=None
+MPI Rank 1:       verbosity=1   
+MPI Rank 1:       features=[
+MPI Rank 1: dim=2      
+MPI Rank 1: start=0    
+MPI Rank 1:       ]
+MPI Rank 1:       labels=[
+MPI Rank 1: start=2      
+MPI Rank 1: dim=1        
+MPI Rank 1: labelDim=2   
+MPI Rank 1:         labelMappingFile=$DataDir$/SimpleMapping.txt
+MPI Rank 1:       ]
+MPI Rank 1:     ]
+MPI Rank 1: ]
+MPI Rank 1: RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 1: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data
+MPI Rank 1: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining
+MPI Rank 1: DeviceId=0
+MPI Rank 1: precision=float
+MPI Rank 1: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
+MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
+MPI Rank 1: 
+MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
+MPI Rank 1: 
+MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+MPI Rank 1: deviceId=0
+MPI Rank 1: command=SimpleMultiGPU
+MPI Rank 1: precision=float
+MPI Rank 1: parallelTrain=true
+MPI Rank 1: SimpleMultiGPU=[
+MPI Rank 1:     action=train
+MPI Rank 1:     modelPath=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 1:     deviceId=0
+MPI Rank 1:     traceLevel=1
+MPI Rank 1:     SimpleNetworkBuilder=[
+MPI Rank 1:         layerSizes=2:50*2:2
+MPI Rank 1:         trainingCriterion=CrossEntropyWithSoftmax
+MPI Rank 1:         evalCriterion=ErrorPrediction
+MPI Rank 1:         layerTypes=Sigmoid
+MPI Rank 1:         initValueScale=1.0
+MPI Rank 1:         applyMeanVarNorm=true
+MPI Rank 1:         uniformInit=true
+MPI Rank 1:         needPrior=true
+MPI Rank 1:     ]
+MPI Rank 1:     SGD=[
+MPI Rank 1:         epochSize=0 
+MPI Rank 1:         minibatchSize=25
+MPI Rank 1:         learningRatesPerMB=0.5:0.2*20:0.1
+MPI Rank 1:         momentumPerMB=0.9
+MPI Rank 1:         dropoutRate=0.0
+MPI Rank 1:         maxEpochs=4
+MPI Rank 1:         ParallelTrain=[
+MPI Rank 1:             parallelizationMethod=DataParallelSGD
+MPI Rank 1:             DataParallelSGD=[
+MPI Rank 1:               gradientBits=1
+MPI Rank 1:             ]
+MPI Rank 1:         ]
+MPI Rank 1:     ]
+MPI Rank 1:     reader=[
+MPI Rank 1:       readerType=UCIFastReader
+MPI Rank 1:       file=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 1:       miniBatchMode=Partial
+MPI Rank 1:       randomize=None
+MPI Rank 1:       verbosity=1   
+MPI Rank 1:       features=[
+MPI Rank 1: dim=2      
+MPI Rank 1: start=0    
+MPI Rank 1:       ]
+MPI Rank 1:       labels=[
+MPI Rank 1: start=2      
+MPI Rank 1: dim=1        
+MPI Rank 1: labelDim=2   
+MPI Rank 1:         labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleMapping.txt
+MPI Rank 1:       ]
+MPI Rank 1:     ]
+MPI Rank 1: ]
+MPI Rank 1: RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 1: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data
+MPI Rank 1: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining
+MPI Rank 1: DeviceId=0
+MPI Rank 1: precision=float
+MPI Rank 1: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
+MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
+MPI Rank 1: 
+MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+MPI Rank 1: 
+MPI Rank 1: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+MPI Rank 1: configparameters: SimpleMultiGPU.config:command=SimpleMultiGPU
+MPI Rank 1: configparameters: SimpleMultiGPU.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining
+MPI Rank 1: configparameters: SimpleMultiGPU.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data
+MPI Rank 1: configparameters: SimpleMultiGPU.config:deviceId=0
+MPI Rank 1: configparameters: SimpleMultiGPU.config:parallelTrain=true
+MPI Rank 1: configparameters: SimpleMultiGPU.config:precision=float
+MPI Rank 1: configparameters: SimpleMultiGPU.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 1: configparameters: SimpleMultiGPU.config:SimpleMultiGPU=[
+MPI Rank 1:     action=train
+MPI Rank 1:     modelPath=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 1:     deviceId=0
+MPI Rank 1:     traceLevel=1
+MPI Rank 1:     SimpleNetworkBuilder=[
+MPI Rank 1:         layerSizes=2:50*2:2
+MPI Rank 1:         trainingCriterion=CrossEntropyWithSoftmax
+MPI Rank 1:         evalCriterion=ErrorPrediction
+MPI Rank 1:         layerTypes=Sigmoid
+MPI Rank 1:         initValueScale=1.0
+MPI Rank 1:         applyMeanVarNorm=true
+MPI Rank 1:         uniformInit=true
+MPI Rank 1:         needPrior=true
+MPI Rank 1:     ]
+MPI Rank 1:     SGD=[
+MPI Rank 1:         epochSize=0 
+MPI Rank 1:         minibatchSize=25
+MPI Rank 1:         learningRatesPerMB=0.5:0.2*20:0.1
+MPI Rank 1:         momentumPerMB=0.9
+MPI Rank 1:         dropoutRate=0.0
+MPI Rank 1:         maxEpochs=4
+MPI Rank 1:         ParallelTrain=[
+MPI Rank 1:             parallelizationMethod=DataParallelSGD
+MPI Rank 1:             DataParallelSGD=[
+MPI Rank 1:               gradientBits=1
+MPI Rank 1:             ]
+MPI Rank 1:         ]
+MPI Rank 1:     ]
+MPI Rank 1:     reader=[
+MPI Rank 1:       readerType=UCIFastReader
+MPI Rank 1:       file=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 1:       miniBatchMode=Partial
+MPI Rank 1:       randomize=None
+MPI Rank 1:       verbosity=1   
+MPI Rank 1:       features=[
+MPI Rank 1: dim=2      
+MPI Rank 1: start=0    
+MPI Rank 1:       ]
+MPI Rank 1:       labels=[
+MPI Rank 1: start=2      
+MPI Rank 1: dim=1        
+MPI Rank 1: labelDim=2   
+MPI Rank 1:         labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleMapping.txt
+MPI Rank 1:       ]
+MPI Rank 1:     ]
+MPI Rank 1: ] [SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
+MPI Rank 1: 
+MPI Rank 1: configparameters: SimpleMultiGPU.config:stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
+MPI Rank 1: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+MPI Rank 1: command: SimpleMultiGPU 
+MPI Rank 1: precision = float
+MPI Rank 1: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 1: CNTKCommandTrainInfo: SimpleMultiGPU : 4
+MPI Rank 1: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 4
+MPI Rank 1: CNTKCommandTrainBegin: SimpleMultiGPU
+MPI Rank 1: SimpleNetworkBuilder Using GPU 0
+MPI Rank 1: reading uci file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 1: SetUniformRandomValue (GPU): creating curand object with seed 1
+MPI Rank 1: GetTrainCriterionNodes  ...
+MPI Rank 1: GetEvalCriterionNodes  ...
+MPI Rank 1: 
+MPI Rank 1: 
+MPI Rank 1: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 1: 
+MPI Rank 1: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 1: 
+MPI Rank 1: Validating for node CrossEntropyWithSoftmax, final verification.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 1: 
+MPI Rank 1: 9 out of 20 nodes do not share the minibatch layout with the input data.
+MPI Rank 1: 
+MPI Rank 1: 
+MPI Rank 1: Precomputing --> 3 PreCompute nodes found.
+MPI Rank 1: 
+MPI Rank 1: 	NodeName: InvStdOfFeatures
+MPI Rank 1: 	NodeName: MeanOfFeatures
+MPI Rank 1: 	NodeName: Prior
+MPI Rank 1: starting at epoch 0 counting lines to determine record count
+MPI Rank 1: 
+MPI Rank 1:  10000 records found
+MPI Rank 1: starting epoch 0 at record count 0, and file position 0
+MPI Rank 1: already there from last epoch
+MPI Rank 1: 
+MPI Rank 1: 
+MPI Rank 1: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1.
+MPI Rank 1: 
+MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: 
+MPI Rank 1: Validating for node InvStdOfFeatures, final verification.
+MPI Rank 1: 
+MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: 
+MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data.
+MPI Rank 1: 
+MPI Rank 1: 
+MPI Rank 1: 
+MPI Rank 1: Validating for node MeanOfFeatures. 2 nodes to process in pass 1.
+MPI Rank 1: 
+MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: 
+MPI Rank 1: Validating for node MeanOfFeatures, final verification.
+MPI Rank 1: 
+MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: 
+MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data.
+MPI Rank 1: 
+MPI Rank 1: 
+MPI Rank 1: 
+MPI Rank 1: Validating for node Prior. 2 nodes to process in pass 1.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: 
+MPI Rank 1: Validating for node Prior. 1 nodes to process in pass 2.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: 
+MPI Rank 1: Validating for node Prior, final verification.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 1: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 1: 
+MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data.
+MPI Rank 1: 
+MPI Rank 1: 
+MPI Rank 1: Precomputing --> Completed.
+MPI Rank 1: 
+MPI Rank 1: Set Max Temp Mem Size For Convolution Nodes to 0 samples.
+MPI Rank 1: Starting Epoch 1: learning rate per sample = 0.020000  effective momentum = 0.900000 
+MPI Rank 1: starting epoch 0 at record count 0, and file position 0
+MPI Rank 1: already there from last epoch
+MPI Rank 1: 
+MPI Rank 1: 
+MPI Rank 1: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 1: 
+MPI Rank 1: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 1: 
+MPI Rank 1: Validating for node EvalErrorPrediction, final verification.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 1: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 1: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 1: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 1: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 1: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 1: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 1: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 1: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 1: 
+MPI Rank 1: 9 out of 20 nodes do not share the minibatch layout with the input data.
+MPI Rank 1: 
+MPI Rank 1: 
+MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 4, NumGradientBits = 32).
+MPI Rank 1: DecimateMinibatchSequences: WARNING: Number of parallel utterances 25 not a multiple of number of GPUs 4, GPU usage will be suboptimal.
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[   1-  10 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70007977; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.21240s; TotalTimePerSample = 0.84959ms; SamplesPerSecond = 1177
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[  11-  20 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71514542; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.17214s; TotalTimePerSample = 0.68857ms; SamplesPerSecond = 1452
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[  21-  30 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72945595; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.16860s; TotalTimePerSample = 0.67442ms; SamplesPerSecond = 1482
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[  31-  40 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70079058; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.16047s; TotalTimePerSample = 0.64187ms; SamplesPerSecond = 1557
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[  41-  50 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70605615; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.16232s; TotalTimePerSample = 0.64926ms; SamplesPerSecond = 1540
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[  51-  60 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71572398; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.15152s; TotalTimePerSample = 0.60609ms; SamplesPerSecond = 1649
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[  61-  70 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72149851; EvalErr[0]PerSample = 0.48000000; TotalTime = 0.14175s; TotalTimePerSample = 0.56700ms; SamplesPerSecond = 1763
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[  71-  80 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.79845604; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.14218s; TotalTimePerSample = 0.56872ms; SamplesPerSecond = 1758
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[  81-  90 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69665186; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.13820s; TotalTimePerSample = 0.55280ms; SamplesPerSecond = 1808
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[  91- 100 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70723326; EvalErr[0]PerSample = 0.49200000; TotalTime = 0.12737s; TotalTimePerSample = 0.50948ms; SamplesPerSecond = 1962
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 101- 110 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71420345; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.13145s; TotalTimePerSample = 0.52581ms; SamplesPerSecond = 1901
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 111- 120 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69535258; EvalErr[0]PerSample = 0.43600000; TotalTime = 0.12711s; TotalTimePerSample = 0.50843ms; SamplesPerSecond = 1966
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 121- 130 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70078531; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12442s; TotalTimePerSample = 0.49766ms; SamplesPerSecond = 2009
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 131- 140 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71857916; EvalErr[0]PerSample = 0.54800000; TotalTime = 0.11861s; TotalTimePerSample = 0.47442ms; SamplesPerSecond = 2107
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 141- 150 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72088358; EvalErr[0]PerSample = 0.48800000; TotalTime = 0.11955s; TotalTimePerSample = 0.47821ms; SamplesPerSecond = 2091
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 151- 160 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71798840; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.11551s; TotalTimePerSample = 0.46204ms; SamplesPerSecond = 2164
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 161- 170 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.74162164; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.11801s; TotalTimePerSample = 0.47202ms; SamplesPerSecond = 2118
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 171- 180 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71835128; EvalErr[0]PerSample = 0.51600000; TotalTime = 0.11850s; TotalTimePerSample = 0.47402ms; SamplesPerSecond = 2109
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 181- 190 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71529463; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12523s; TotalTimePerSample = 0.50093ms; SamplesPerSecond = 1996
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 191- 200 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71727657; EvalErr[0]PerSample = 0.53200000; TotalTime = 0.12007s; TotalTimePerSample = 0.48030ms; SamplesPerSecond = 2082
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 201- 210 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71745517; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.11898s; TotalTimePerSample = 0.47592ms; SamplesPerSecond = 2101
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 211- 220 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72088398; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.11488s; TotalTimePerSample = 0.45952ms; SamplesPerSecond = 2176
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 221- 230 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72006809; EvalErr[0]PerSample = 0.50800000; TotalTime = 0.12370s; TotalTimePerSample = 0.49478ms; SamplesPerSecond = 2021
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 231- 240 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71275468; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12142s; TotalTimePerSample = 0.48566ms; SamplesPerSecond = 2059
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 241- 250 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69644781; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.11773s; TotalTimePerSample = 0.47092ms; SamplesPerSecond = 2123
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 251- 260 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70129698; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12147s; TotalTimePerSample = 0.48588ms; SamplesPerSecond = 2058
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 261- 270 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70768095; EvalErr[0]PerSample = 0.54400000; TotalTime = 0.12122s; TotalTimePerSample = 0.48487ms; SamplesPerSecond = 2062
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 271- 280 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69744379; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.11718s; TotalTimePerSample = 0.46871ms; SamplesPerSecond = 2133
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 281- 290 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69266187; EvalErr[0]PerSample = 0.44800000; TotalTime = 0.11804s; TotalTimePerSample = 0.47215ms; SamplesPerSecond = 2117
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 291- 300 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69347266; EvalErr[0]PerSample = 0.49600000; TotalTime = 0.12116s; TotalTimePerSample = 0.48466ms; SamplesPerSecond = 2063
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 301- 310 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69257409; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12555s; TotalTimePerSample = 0.50218ms; SamplesPerSecond = 1991
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 311- 320 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.68625741; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12602s; TotalTimePerSample = 0.50407ms; SamplesPerSecond = 1983
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 321- 330 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69064011; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12332s; TotalTimePerSample = 0.49329ms; SamplesPerSecond = 2027
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 331- 340 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70192154; EvalErr[0]PerSample = 0.46000000; TotalTime = 0.12449s; TotalTimePerSample = 0.49795ms; SamplesPerSecond = 2008
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 341- 350 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69058912; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12387s; TotalTimePerSample = 0.49546ms; SamplesPerSecond = 2018
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 351- 360 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.67041492; EvalErr[0]PerSample = 0.39200000; TotalTime = 0.12148s; TotalTimePerSample = 0.48592ms; SamplesPerSecond = 2057
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 361- 370 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.65913973; EvalErr[0]PerSample = 0.35600000; TotalTime = 0.12846s; TotalTimePerSample = 0.51382ms; SamplesPerSecond = 1946
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 371- 380 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.63919877; EvalErr[0]PerSample = 0.36400000; TotalTime = 0.14472s; TotalTimePerSample = 0.57890ms; SamplesPerSecond = 1727
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 381- 390 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.61293883; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.14464s; TotalTimePerSample = 0.57857ms; SamplesPerSecond = 1728
+MPI Rank 1:  Epoch[ 1 of 4]-Minibatch[ 391- 400 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.55255352; EvalErr[0]PerSample = 0.18800000; TotalTime = 0.14544s; TotalTimePerSample = 0.58177ms; SamplesPerSecond = 1718
+MPI Rank 1: Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.4735; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.368273
+MPI Rank 1: Starting Epoch 2: learning rate per sample = 0.008000  effective momentum = 0.900000 
+MPI Rank 1: starting epoch 1 at record count 10000, and file position 0
+MPI Rank 1: already there from last epoch
+MPI Rank 1: 
+MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 4, NumGradientBits = 32).
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.50774625; EvalErr[0]PerSample = 0.24000000; TotalTime = 0.13816s; TotalTimePerSample = 0.55263ms; SamplesPerSecond = 1809
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.43388927; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.13876s; TotalTimePerSample = 0.55503ms; SamplesPerSecond = 1801
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.36674870; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.14014s; TotalTimePerSample = 0.56057ms; SamplesPerSecond = 1783
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.33768765; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.13764s; TotalTimePerSample = 0.55054ms; SamplesPerSecond = 1816
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.30320946; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12862s; TotalTimePerSample = 0.51449ms; SamplesPerSecond = 1943
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.29576043; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.14846s; TotalTimePerSample = 0.59382ms; SamplesPerSecond = 1684
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.24924491; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14674s; TotalTimePerSample = 0.58698ms; SamplesPerSecond = 1703
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.24632415; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.14428s; TotalTimePerSample = 0.57712ms; SamplesPerSecond = 1732
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20943158; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.13610s; TotalTimePerSample = 0.54438ms; SamplesPerSecond = 1836
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19115996; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.13467s; TotalTimePerSample = 0.53866ms; SamplesPerSecond = 1856
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17923231; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.13753s; TotalTimePerSample = 0.55014ms; SamplesPerSecond = 1817
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17075422; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.13578s; TotalTimePerSample = 0.54313ms; SamplesPerSecond = 1841
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14442371; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.13113s; TotalTimePerSample = 0.52451ms; SamplesPerSecond = 1906
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17753819; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.13552s; TotalTimePerSample = 0.54206ms; SamplesPerSecond = 1844
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15087855; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.13849s; TotalTimePerSample = 0.55396ms; SamplesPerSecond = 1805
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19253023; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.13517s; TotalTimePerSample = 0.54068ms; SamplesPerSecond = 1849
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17830684; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.14493s; TotalTimePerSample = 0.57970ms; SamplesPerSecond = 1725
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15115428; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.14346s; TotalTimePerSample = 0.57384ms; SamplesPerSecond = 1742
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19135968; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.14339s; TotalTimePerSample = 0.57354ms; SamplesPerSecond = 1743
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.21491485; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.14541s; TotalTimePerSample = 0.58166ms; SamplesPerSecond = 1719
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18682346; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.13568s; TotalTimePerSample = 0.54271ms; SamplesPerSecond = 1842
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18483205; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14328s; TotalTimePerSample = 0.57313ms; SamplesPerSecond = 1744
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14684504; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.14565s; TotalTimePerSample = 0.58262ms; SamplesPerSecond = 1716
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15322115; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14660s; TotalTimePerSample = 0.58641ms; SamplesPerSecond = 1705
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19882571; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.14054s; TotalTimePerSample = 0.56215ms; SamplesPerSecond = 1778
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13683833; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.14033s; TotalTimePerSample = 0.56131ms; SamplesPerSecond = 1781
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18621188; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.13395s; TotalTimePerSample = 0.53582ms; SamplesPerSecond = 1866
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19408048; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.13231s; TotalTimePerSample = 0.52925ms; SamplesPerSecond = 1889
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17298137; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.14681s; TotalTimePerSample = 0.58725ms; SamplesPerSecond = 1702
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13265130; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.13163s; TotalTimePerSample = 0.52651ms; SamplesPerSecond = 1899
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17627178; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.13127s; TotalTimePerSample = 0.52509ms; SamplesPerSecond = 1904
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12734628; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12747s; TotalTimePerSample = 0.50988ms; SamplesPerSecond = 1961
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15108451; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12633s; TotalTimePerSample = 0.50532ms; SamplesPerSecond = 1978
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19729184; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12381s; TotalTimePerSample = 0.49523ms; SamplesPerSecond = 2019
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12857332; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12245s; TotalTimePerSample = 0.48981ms; SamplesPerSecond = 2041
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13867804; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12077s; TotalTimePerSample = 0.48310ms; SamplesPerSecond = 2069
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12786050; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12252s; TotalTimePerSample = 0.49008ms; SamplesPerSecond = 2040
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16643303; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12600s; TotalTimePerSample = 0.50401ms; SamplesPerSecond = 1984
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20440409; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12218s; TotalTimePerSample = 0.48874ms; SamplesPerSecond = 2046
+MPI Rank 1:  Epoch[ 2 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14566238; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12172s; TotalTimePerSample = 0.48689ms; SamplesPerSecond = 2053
+MPI Rank 1: Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 0.20373029; EvalErrPerSample = 0.0827; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.454826
+MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.008000  effective momentum = 0.900000 
+MPI Rank 1: starting epoch 2 at record count 20000, and file position 0
+MPI Rank 1: already there from last epoch
+MPI Rank 1: 
+MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 4, NumGradientBits = 32).
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12590085; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12007s; TotalTimePerSample = 0.48028ms; SamplesPerSecond = 2082
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17780229; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12506s; TotalTimePerSample = 0.50025ms; SamplesPerSecond = 1999
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14417637; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12289s; TotalTimePerSample = 0.49155ms; SamplesPerSecond = 2034
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15796895; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12196s; TotalTimePerSample = 0.48784ms; SamplesPerSecond = 2049
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17002999; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12311s; TotalTimePerSample = 0.49244ms; SamplesPerSecond = 2030
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18262114; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.11534s; TotalTimePerSample = 0.46136ms; SamplesPerSecond = 2167
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14643694; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.11576s; TotalTimePerSample = 0.46302ms; SamplesPerSecond = 2159
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18030528; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12379s; TotalTimePerSample = 0.49516ms; SamplesPerSecond = 2019
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15846150; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12204s; TotalTimePerSample = 0.48814ms; SamplesPerSecond = 2048
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14486534; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12041s; TotalTimePerSample = 0.48164ms; SamplesPerSecond = 2076
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13469093; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.11856s; TotalTimePerSample = 0.47423ms; SamplesPerSecond = 2108
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12682s; TotalTimePerSample = 0.50730ms; SamplesPerSecond = 1971
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.11641295; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.11948s; TotalTimePerSample = 0.47792ms; SamplesPerSecond = 2092
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16786647; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.11879s; TotalTimePerSample = 0.47514ms; SamplesPerSecond = 2104
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12811514; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12407s; TotalTimePerSample = 0.49630ms; SamplesPerSecond = 2014
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17257851; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.11792s; TotalTimePerSample = 0.47167ms; SamplesPerSecond = 2120
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17623656; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12138s; TotalTimePerSample = 0.48552ms; SamplesPerSecond = 2059
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14121117; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12145s; TotalTimePerSample = 0.48581ms; SamplesPerSecond = 2058
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19243443; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12131s; TotalTimePerSample = 0.48526ms; SamplesPerSecond = 2060
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20908161; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12898s; TotalTimePerSample = 0.51590ms; SamplesPerSecond = 1938
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18472067; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12863s; TotalTimePerSample = 0.51450ms; SamplesPerSecond = 1943
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18185535; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12226s; TotalTimePerSample = 0.48902ms; SamplesPerSecond = 2044
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14074205; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.13032s; TotalTimePerSample = 0.52128ms; SamplesPerSecond = 1918
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14871620; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12011s; TotalTimePerSample = 0.48045ms; SamplesPerSecond = 2081
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20299704; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12255s; TotalTimePerSample = 0.49020ms; SamplesPerSecond = 2040
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12852038; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12023s; TotalTimePerSample = 0.48093ms; SamplesPerSecond = 2079
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18660439; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12387s; TotalTimePerSample = 0.49548ms; SamplesPerSecond = 2018
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19575997; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12145s; TotalTimePerSample = 0.48581ms; SamplesPerSecond = 2058
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16667676; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12442s; TotalTimePerSample = 0.49768ms; SamplesPerSecond = 2009
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12526169; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12472s; TotalTimePerSample = 0.49888ms; SamplesPerSecond = 2004
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17392131; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12284s; TotalTimePerSample = 0.49135ms; SamplesPerSecond = 2035
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12281615; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12305s; TotalTimePerSample = 0.49218ms; SamplesPerSecond = 2031
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14759390; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.11614s; TotalTimePerSample = 0.46454ms; SamplesPerSecond = 2152
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19801300; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12518s; TotalTimePerSample = 0.50072ms; SamplesPerSecond = 1997
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12593395; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12264s; TotalTimePerSample = 0.49056ms; SamplesPerSecond = 2038
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13756617; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12058s; TotalTimePerSample = 0.48234ms; SamplesPerSecond = 2073
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12910s; TotalTimePerSample = 0.51639ms; SamplesPerSecond = 1936
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16654369; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.13325s; TotalTimePerSample = 0.53300ms; SamplesPerSecond = 1876
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20658951; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12111s; TotalTimePerSample = 0.48445ms; SamplesPerSecond = 2064
+MPI Rank 1:  Epoch[ 3 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14583322; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.11517s; TotalTimePerSample = 0.46067ms; SamplesPerSecond = 2170
+MPI Rank 1: Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 0.15948618; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.926445
+MPI Rank 1: Starting Epoch 4: learning rate per sample = 0.008000  effective momentum = 0.900000 
+MPI Rank 1: starting epoch 3 at record count 30000, and file position 0
+MPI Rank 1: already there from last epoch
+MPI Rank 1: 
+MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 4, NumGradientBits = 32).
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12371232; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12017s; TotalTimePerSample = 0.48068ms; SamplesPerSecond = 2080
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18070515; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12035s; TotalTimePerSample = 0.48139ms; SamplesPerSecond = 2077
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14239730; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12719s; TotalTimePerSample = 0.50874ms; SamplesPerSecond = 1965
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15630155; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.11979s; TotalTimePerSample = 0.47914ms; SamplesPerSecond = 2087
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16935525; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.11890s; TotalTimePerSample = 0.47562ms; SamplesPerSecond = 2102
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18198833; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12220s; TotalTimePerSample = 0.48880ms; SamplesPerSecond = 2045
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14475946; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12005s; TotalTimePerSample = 0.48020ms; SamplesPerSecond = 2082
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18021601; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.11799s; TotalTimePerSample = 0.47197ms; SamplesPerSecond = 2118
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15849308; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.11827s; TotalTimePerSample = 0.47310ms; SamplesPerSecond = 2113
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14474425; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12205s; TotalTimePerSample = 0.48818ms; SamplesPerSecond = 2048
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13362926; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.11950s; TotalTimePerSample = 0.47799ms; SamplesPerSecond = 2092
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13708299; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12274s; TotalTimePerSample = 0.49095ms; SamplesPerSecond = 2036
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.11569777; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12041s; TotalTimePerSample = 0.48166ms; SamplesPerSecond = 2076
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16892331; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12018s; TotalTimePerSample = 0.48073ms; SamplesPerSecond = 2080
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12752162; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.11985s; TotalTimePerSample = 0.47938ms; SamplesPerSecond = 2086
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17100866; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12315s; TotalTimePerSample = 0.49262ms; SamplesPerSecond = 2029
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17660425; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.11828s; TotalTimePerSample = 0.47312ms; SamplesPerSecond = 2113
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14105803; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12278s; TotalTimePerSample = 0.49111ms; SamplesPerSecond = 2036
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19333552; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12607s; TotalTimePerSample = 0.50429ms; SamplesPerSecond = 1982
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20859524; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.14369s; TotalTimePerSample = 0.57476ms; SamplesPerSecond = 1739
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18499677; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12830s; TotalTimePerSample = 0.51322ms; SamplesPerSecond = 1948
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18152438; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12787s; TotalTimePerSample = 0.51148ms; SamplesPerSecond = 1955
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14037157; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.11950s; TotalTimePerSample = 0.47798ms; SamplesPerSecond = 2092
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14866862; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12108s; TotalTimePerSample = 0.48433ms; SamplesPerSecond = 2064
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20347747; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.11742s; TotalTimePerSample = 0.46968ms; SamplesPerSecond = 2129
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12815012; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12145s; TotalTimePerSample = 0.48580ms; SamplesPerSecond = 2058
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18672810; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12065s; TotalTimePerSample = 0.48258ms; SamplesPerSecond = 2072
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19552990; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12248s; TotalTimePerSample = 0.48990ms; SamplesPerSecond = 2041
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16452642; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12369s; TotalTimePerSample = 0.49474ms; SamplesPerSecond = 2021
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12461825; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12129s; TotalTimePerSample = 0.48516ms; SamplesPerSecond = 2061
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17285251; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12227s; TotalTimePerSample = 0.48908ms; SamplesPerSecond = 2044
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12253620; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12242s; TotalTimePerSample = 0.48967ms; SamplesPerSecond = 2042
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14723334; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12409s; TotalTimePerSample = 0.49637ms; SamplesPerSecond = 2014
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19789537; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12229s; TotalTimePerSample = 0.48915ms; SamplesPerSecond = 2044
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12575877; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12467s; TotalTimePerSample = 0.49868ms; SamplesPerSecond = 2005
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13745928; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12625s; TotalTimePerSample = 0.50498ms; SamplesPerSecond = 1980
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12839652; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12036s; TotalTimePerSample = 0.48143ms; SamplesPerSecond = 2077
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16647280; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12133s; TotalTimePerSample = 0.48530ms; SamplesPerSecond = 2060
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20679434; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12414s; TotalTimePerSample = 0.49655ms; SamplesPerSecond = 2013
+MPI Rank 1:  Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12625s; TotalTimePerSample = 0.50498ms; SamplesPerSecond = 1980
+MPI Rank 1: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.931591
+MPI Rank 1: CNTKCommandTrainEnd: SimpleMultiGPU
+MPI Rank 1: COMPLETED
+MPI Rank 1: ~MPIWrapper
+MPI Rank 2: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank2
+MPI Rank 2: -------------------------------------------------------------------
+MPI Rank 2: Build info: 
+MPI Rank 2: 
+MPI Rank 2: 		Built time: Oct 24 2015 13:33:25
+MPI Rank 2: 		Last modified date: Thu Oct 22 16:00:27 2015
+MPI Rank 2: 		Built by amitaga on Amitaga-Win-DT3           
+MPI Rank 2: 		Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\
+MPI Rank 2: 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
+MPI Rank 2: -------------------------------------------------------------------
+MPI Rank 2: running on Amitaga-Win-DT3 at 2015/10/24 21:49:39
+MPI Rank 2: command line: 
+MPI Rank 2: E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining/SimpleMultiGPU.config RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining DeviceId=0 precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr 
+MPI Rank 2: 
+MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
+MPI Rank 2: deviceId=$DeviceId$
+MPI Rank 2: command=SimpleMultiGPU
+MPI Rank 2: precision=float
+MPI Rank 2: parallelTrain=true
+MPI Rank 2: SimpleMultiGPU=[
+MPI Rank 2:     action=train
+MPI Rank 2:     modelPath=$RunDir$/models/Simple.dnn
+MPI Rank 2:     deviceId=$DeviceId$
+MPI Rank 2:     traceLevel=1
+MPI Rank 2:     SimpleNetworkBuilder=[
+MPI Rank 2:         layerSizes=2:50*2:2
+MPI Rank 2:         trainingCriterion=CrossEntropyWithSoftmax
+MPI Rank 2:         evalCriterion=ErrorPrediction
+MPI Rank 2:         layerTypes=Sigmoid
+MPI Rank 2:         initValueScale=1.0
+MPI Rank 2:         applyMeanVarNorm=true
+MPI Rank 2:         uniformInit=true
+MPI Rank 2:         needPrior=true
+MPI Rank 2:     ]
+MPI Rank 2:     SGD=[
+MPI Rank 2:         epochSize=0 
+MPI Rank 2:         minibatchSize=25
+MPI Rank 2:         learningRatesPerMB=0.5:0.2*20:0.1
+MPI Rank 2:         momentumPerMB=0.9
+MPI Rank 2:         dropoutRate=0.0
+MPI Rank 2:         maxEpochs=4
+MPI Rank 2:         ParallelTrain=[
+MPI Rank 2:             parallelizationMethod=DataParallelSGD
+MPI Rank 2:             DataParallelSGD=[
+MPI Rank 2:               gradientBits=1
+MPI Rank 2:             ]
+MPI Rank 2:         ]
+MPI Rank 2:     ]
+MPI Rank 2:     reader=[
+MPI Rank 2:       readerType=UCIFastReader
+MPI Rank 2:       file=$DataDir$/SimpleDataTrain.txt
+MPI Rank 2:       miniBatchMode=Partial
+MPI Rank 2:       randomize=None
+MPI Rank 2:       verbosity=1   
+MPI Rank 2:       features=[
+MPI Rank 2: dim=2      
+MPI Rank 2: start=0    
+MPI Rank 2:       ]
+MPI Rank 2:       labels=[
+MPI Rank 2: start=2      
+MPI Rank 2: dim=1        
+MPI Rank 2: labelDim=2   
+MPI Rank 2:         labelMappingFile=$DataDir$/SimpleMapping.txt
+MPI Rank 2:       ]
+MPI Rank 2:     ]
+MPI Rank 2: ]
+MPI Rank 2: RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 2: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data
+MPI Rank 2: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining
+MPI Rank 2: DeviceId=0
+MPI Rank 2: precision=float
+MPI Rank 2: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
+MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
+MPI Rank 2: 
+MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
+MPI Rank 2: 
+MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+MPI Rank 2: deviceId=0
+MPI Rank 2: command=SimpleMultiGPU
+MPI Rank 2: precision=float
+MPI Rank 2: parallelTrain=true
+MPI Rank 2: SimpleMultiGPU=[
+MPI Rank 2:     action=train
+MPI Rank 2:     modelPath=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 2:     deviceId=0
+MPI Rank 2:     traceLevel=1
+MPI Rank 2:     SimpleNetworkBuilder=[
+MPI Rank 2:         layerSizes=2:50*2:2
+MPI Rank 2:         trainingCriterion=CrossEntropyWithSoftmax
+MPI Rank 2:         evalCriterion=ErrorPrediction
+MPI Rank 2:         layerTypes=Sigmoid
+MPI Rank 2:         initValueScale=1.0
+MPI Rank 2:         applyMeanVarNorm=true
+MPI Rank 2:         uniformInit=true
+MPI Rank 2:         needPrior=true
+MPI Rank 2:     ]
+MPI Rank 2:     SGD=[
+MPI Rank 2:         epochSize=0 
+MPI Rank 2:         minibatchSize=25
+MPI Rank 2:         learningRatesPerMB=0.5:0.2*20:0.1
+MPI Rank 2:         momentumPerMB=0.9
+MPI Rank 2:         dropoutRate=0.0
+MPI Rank 2:         maxEpochs=4
+MPI Rank 2:         ParallelTrain=[
+MPI Rank 2:             parallelizationMethod=DataParallelSGD
+MPI Rank 2:             DataParallelSGD=[
+MPI Rank 2:               gradientBits=1
+MPI Rank 2:             ]
+MPI Rank 2:         ]
+MPI Rank 2:     ]
+MPI Rank 2:     reader=[
+MPI Rank 2:       readerType=UCIFastReader
+MPI Rank 2:       file=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 2:       miniBatchMode=Partial
+MPI Rank 2:       randomize=None
+MPI Rank 2:       verbosity=1   
+MPI Rank 2:       features=[
+MPI Rank 2: dim=2      
+MPI Rank 2: start=0    
+MPI Rank 2:       ]
+MPI Rank 2:       labels=[
+MPI Rank 2: start=2      
+MPI Rank 2: dim=1        
+MPI Rank 2: labelDim=2   
+MPI Rank 2:         labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleMapping.txt
+MPI Rank 2:       ]
+MPI Rank 2:     ]
+MPI Rank 2: ]
+MPI Rank 2: RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 2: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data
+MPI Rank 2: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining
+MPI Rank 2: DeviceId=0
+MPI Rank 2: precision=float
+MPI Rank 2: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
+MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
+MPI Rank 2: 
+MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+MPI Rank 2: 
+MPI Rank 2: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+MPI Rank 2: configparameters: SimpleMultiGPU.config:command=SimpleMultiGPU
+MPI Rank 2: configparameters: SimpleMultiGPU.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining
+MPI Rank 2: configparameters: SimpleMultiGPU.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data
+MPI Rank 2: configparameters: SimpleMultiGPU.config:deviceId=0
+MPI Rank 2: configparameters: SimpleMultiGPU.config:parallelTrain=true
+MPI Rank 2: configparameters: SimpleMultiGPU.config:precision=float
+MPI Rank 2: configparameters: SimpleMultiGPU.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 2: configparameters: SimpleMultiGPU.config:SimpleMultiGPU=[
+MPI Rank 2:     action=train
+MPI Rank 2:     modelPath=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 2:     deviceId=0
+MPI Rank 2:     traceLevel=1
+MPI Rank 2:     SimpleNetworkBuilder=[
+MPI Rank 2:         layerSizes=2:50*2:2
+MPI Rank 2:         trainingCriterion=CrossEntropyWithSoftmax
+MPI Rank 2:         evalCriterion=ErrorPrediction
+MPI Rank 2:         layerTypes=Sigmoid
+MPI Rank 2:         initValueScale=1.0
+MPI Rank 2:         applyMeanVarNorm=true
+MPI Rank 2:         uniformInit=true
+MPI Rank 2:         needPrior=true
+MPI Rank 2:     ]
+MPI Rank 2:     SGD=[
+MPI Rank 2:         epochSize=0 
+MPI Rank 2:         minibatchSize=25
+MPI Rank 2:         learningRatesPerMB=0.5:0.2*20:0.1
+MPI Rank 2:         momentumPerMB=0.9
+MPI Rank 2:         dropoutRate=0.0
+MPI Rank 2:         maxEpochs=4
+MPI Rank 2:         ParallelTrain=[
+MPI Rank 2:             parallelizationMethod=DataParallelSGD
+MPI Rank 2:             DataParallelSGD=[
+MPI Rank 2:               gradientBits=1
+MPI Rank 2:             ]
+MPI Rank 2:         ]
+MPI Rank 2:     ]
+MPI Rank 2:     reader=[
+MPI Rank 2:       readerType=UCIFastReader
+MPI Rank 2:       file=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 2:       miniBatchMode=Partial
+MPI Rank 2:       randomize=None
+MPI Rank 2:       verbosity=1   
+MPI Rank 2:       features=[
+MPI Rank 2: dim=2      
+MPI Rank 2: start=0    
+MPI Rank 2:       ]
+MPI Rank 2:       labels=[
+MPI Rank 2: start=2      
+MPI Rank 2: dim=1        
+MPI Rank 2: labelDim=2   
+MPI Rank 2:         labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleMapping.txt
+MPI Rank 2:       ]
+MPI Rank 2:     ]
+MPI Rank 2: ] [SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
+MPI Rank 2: 
+MPI Rank 2: configparameters: SimpleMultiGPU.config:stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
+MPI Rank 2: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+MPI Rank 2: command: SimpleMultiGPU 
+MPI Rank 2: precision = float
+MPI Rank 2: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 2: CNTKCommandTrainInfo: SimpleMultiGPU : 4
+MPI Rank 2: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 4
+MPI Rank 2: CNTKCommandTrainBegin: SimpleMultiGPU
+MPI Rank 2: SimpleNetworkBuilder Using GPU 0
+MPI Rank 2: reading uci file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 2: SetUniformRandomValue (GPU): creating curand object with seed 1
+MPI Rank 2: GetTrainCriterionNodes  ...
+MPI Rank 2: GetEvalCriterionNodes  ...
+MPI Rank 2: 
+MPI Rank 2: 
+MPI Rank 2: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 2: 
+MPI Rank 2: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 2: 
+MPI Rank 2: Validating for node CrossEntropyWithSoftmax, final verification.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 2: 
+MPI Rank 2: 9 out of 20 nodes do not share the minibatch layout with the input data.
+MPI Rank 2: 
+MPI Rank 2: 
+MPI Rank 2: Precomputing --> 3 PreCompute nodes found.
+MPI Rank 2: 
+MPI Rank 2: 	NodeName: InvStdOfFeatures
+MPI Rank 2: 	NodeName: MeanOfFeatures
+MPI Rank 2: 	NodeName: Prior
+MPI Rank 2: starting at epoch 0 counting lines to determine record count
+MPI Rank 2: 
+MPI Rank 2:  10000 records found
+MPI Rank 2: starting epoch 0 at record count 0, and file position 0
+MPI Rank 2: already there from last epoch
+MPI Rank 2: 
+MPI Rank 2: 
+MPI Rank 2: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1.
+MPI Rank 2: 
+MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: 
+MPI Rank 2: Validating for node InvStdOfFeatures, final verification.
+MPI Rank 2: 
+MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: 
+MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data.
+MPI Rank 2: 
+MPI Rank 2: 
+MPI Rank 2: 
+MPI Rank 2: Validating for node MeanOfFeatures. 2 nodes to process in pass 1.
+MPI Rank 2: 
+MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: 
+MPI Rank 2: Validating for node MeanOfFeatures, final verification.
+MPI Rank 2: 
+MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: 
+MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data.
+MPI Rank 2: 
+MPI Rank 2: 
+MPI Rank 2: 
+MPI Rank 2: Validating for node Prior. 2 nodes to process in pass 1.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: 
+MPI Rank 2: Validating for node Prior. 1 nodes to process in pass 2.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: 
+MPI Rank 2: Validating for node Prior, final verification.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 2: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 2: 
+MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data.
+MPI Rank 2: 
+MPI Rank 2: 
+MPI Rank 2: Precomputing --> Completed.
+MPI Rank 2: 
+MPI Rank 2: Set Max Temp Mem Size For Convolution Nodes to 0 samples.
+MPI Rank 2: Starting Epoch 1: learning rate per sample = 0.020000  effective momentum = 0.900000 
+MPI Rank 2: starting epoch 0 at record count 0, and file position 0
+MPI Rank 2: already there from last epoch
+MPI Rank 2: 
+MPI Rank 2: 
+MPI Rank 2: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 2: 
+MPI Rank 2: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 2: 
+MPI Rank 2: Validating for node EvalErrorPrediction, final verification.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 2: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 2: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 2: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 2: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 2: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 2: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 2: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 2: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 2: 
+MPI Rank 2: 9 out of 20 nodes do not share the minibatch layout with the input data.
+MPI Rank 2: 
+MPI Rank 2: 
+MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 4, NumGradientBits = 32).
+MPI Rank 2: DecimateMinibatchSequences: WARNING: Number of parallel utterances 25 not a multiple of number of GPUs 4, GPU usage will be suboptimal.
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[   1-  10 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70007977; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.21351s; TotalTimePerSample = 0.85404ms; SamplesPerSecond = 1170
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[  11-  20 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71514542; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.17145s; TotalTimePerSample = 0.68580ms; SamplesPerSecond = 1458
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[  21-  30 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72945595; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.16870s; TotalTimePerSample = 0.67481ms; SamplesPerSecond = 1481
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[  31-  40 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70079058; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.16017s; TotalTimePerSample = 0.64067ms; SamplesPerSecond = 1560
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[  41-  50 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70605615; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.16272s; TotalTimePerSample = 0.65088ms; SamplesPerSecond = 1536
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[  51-  60 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71572398; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.15165s; TotalTimePerSample = 0.60661ms; SamplesPerSecond = 1648
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[  61-  70 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72149851; EvalErr[0]PerSample = 0.48000000; TotalTime = 0.14150s; TotalTimePerSample = 0.56598ms; SamplesPerSecond = 1766
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[  71-  80 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.79845604; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.14222s; TotalTimePerSample = 0.56890ms; SamplesPerSecond = 1757
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[  81-  90 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69665186; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.13869s; TotalTimePerSample = 0.55474ms; SamplesPerSecond = 1802
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[  91- 100 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70723326; EvalErr[0]PerSample = 0.49200000; TotalTime = 0.12734s; TotalTimePerSample = 0.50937ms; SamplesPerSecond = 1963
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 101- 110 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71420345; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.13083s; TotalTimePerSample = 0.52330ms; SamplesPerSecond = 1910
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 111- 120 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69535258; EvalErr[0]PerSample = 0.43600000; TotalTime = 0.12776s; TotalTimePerSample = 0.51105ms; SamplesPerSecond = 1956
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 121- 130 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70078531; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12404s; TotalTimePerSample = 0.49614ms; SamplesPerSecond = 2015
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 131- 140 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71857916; EvalErr[0]PerSample = 0.54800000; TotalTime = 0.11920s; TotalTimePerSample = 0.47680ms; SamplesPerSecond = 2097
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 141- 150 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72088358; EvalErr[0]PerSample = 0.48800000; TotalTime = 0.11919s; TotalTimePerSample = 0.47676ms; SamplesPerSecond = 2097
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 151- 160 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71798840; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.11688s; TotalTimePerSample = 0.46752ms; SamplesPerSecond = 2138
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 161- 170 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.74162164; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.11731s; TotalTimePerSample = 0.46925ms; SamplesPerSecond = 2131
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 171- 180 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71835128; EvalErr[0]PerSample = 0.51600000; TotalTime = 0.11878s; TotalTimePerSample = 0.47512ms; SamplesPerSecond = 2104
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 181- 190 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71529463; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12494s; TotalTimePerSample = 0.49977ms; SamplesPerSecond = 2000
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 191- 200 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71727657; EvalErr[0]PerSample = 0.53200000; TotalTime = 0.11999s; TotalTimePerSample = 0.47998ms; SamplesPerSecond = 2083
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 201- 210 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71745517; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.11901s; TotalTimePerSample = 0.47604ms; SamplesPerSecond = 2100
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 211- 220 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72088398; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.11499s; TotalTimePerSample = 0.45995ms; SamplesPerSecond = 2174
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 221- 230 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72006809; EvalErr[0]PerSample = 0.50800000; TotalTime = 0.12383s; TotalTimePerSample = 0.49534ms; SamplesPerSecond = 2018
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 231- 240 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71275468; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12128s; TotalTimePerSample = 0.48511ms; SamplesPerSecond = 2061
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 241- 250 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69644781; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.11778s; TotalTimePerSample = 0.47112ms; SamplesPerSecond = 2122
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 251- 260 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70129698; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12175s; TotalTimePerSample = 0.48699ms; SamplesPerSecond = 2053
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 261- 270 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70768095; EvalErr[0]PerSample = 0.54400000; TotalTime = 0.12129s; TotalTimePerSample = 0.48515ms; SamplesPerSecond = 2061
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 271- 280 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69744379; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.11708s; TotalTimePerSample = 0.46833ms; SamplesPerSecond = 2135
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 281- 290 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69266187; EvalErr[0]PerSample = 0.44800000; TotalTime = 0.11805s; TotalTimePerSample = 0.47221ms; SamplesPerSecond = 2117
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 291- 300 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69347266; EvalErr[0]PerSample = 0.49600000; TotalTime = 0.12085s; TotalTimePerSample = 0.48341ms; SamplesPerSecond = 2068
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 301- 310 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69257409; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12586s; TotalTimePerSample = 0.50342ms; SamplesPerSecond = 1986
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 311- 320 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.68625741; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12600s; TotalTimePerSample = 0.50399ms; SamplesPerSecond = 1984
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 321- 330 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69064011; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12351s; TotalTimePerSample = 0.49405ms; SamplesPerSecond = 2024
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 331- 340 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70192154; EvalErr[0]PerSample = 0.46000000; TotalTime = 0.12385s; TotalTimePerSample = 0.49541ms; SamplesPerSecond = 2018
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 341- 350 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69058912; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12401s; TotalTimePerSample = 0.49606ms; SamplesPerSecond = 2015
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 351- 360 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.67041492; EvalErr[0]PerSample = 0.39200000; TotalTime = 0.12184s; TotalTimePerSample = 0.48736ms; SamplesPerSecond = 2051
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 361- 370 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.65913973; EvalErr[0]PerSample = 0.35600000; TotalTime = 0.12859s; TotalTimePerSample = 0.51435ms; SamplesPerSecond = 1944
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 371- 380 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.63919877; EvalErr[0]PerSample = 0.36400000; TotalTime = 0.14455s; TotalTimePerSample = 0.57820ms; SamplesPerSecond = 1729
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 381- 390 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.61293883; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.14490s; TotalTimePerSample = 0.57959ms; SamplesPerSecond = 1725
+MPI Rank 2:  Epoch[ 1 of 4]-Minibatch[ 391- 400 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.55255352; EvalErr[0]PerSample = 0.18800000; TotalTime = 0.14556s; TotalTimePerSample = 0.58222ms; SamplesPerSecond = 1717
+MPI Rank 2: Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.4735; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.368498
+MPI Rank 2: Starting Epoch 2: learning rate per sample = 0.008000  effective momentum = 0.900000 
+MPI Rank 2: starting epoch 1 at record count 10000, and file position 0
+MPI Rank 2: already there from last epoch
+MPI Rank 2: 
+MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 4, NumGradientBits = 32).
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.50774625; EvalErr[0]PerSample = 0.24000000; TotalTime = 0.13794s; TotalTimePerSample = 0.55177ms; SamplesPerSecond = 1812
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.43388927; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.13922s; TotalTimePerSample = 0.55688ms; SamplesPerSecond = 1795
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.36674870; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.14013s; TotalTimePerSample = 0.56053ms; SamplesPerSecond = 1784
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.33768765; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.13721s; TotalTimePerSample = 0.54884ms; SamplesPerSecond = 1822
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.30320946; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12919s; TotalTimePerSample = 0.51676ms; SamplesPerSecond = 1935
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.29576043; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.14802s; TotalTimePerSample = 0.59206ms; SamplesPerSecond = 1689
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.24924491; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14685s; TotalTimePerSample = 0.58740ms; SamplesPerSecond = 1702
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.24632415; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.14440s; TotalTimePerSample = 0.57762ms; SamplesPerSecond = 1731
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20943158; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.13614s; TotalTimePerSample = 0.54457ms; SamplesPerSecond = 1836
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19115996; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.13503s; TotalTimePerSample = 0.54011ms; SamplesPerSecond = 1851
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17923231; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.13615s; TotalTimePerSample = 0.54460ms; SamplesPerSecond = 1836
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17075422; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.13632s; TotalTimePerSample = 0.54526ms; SamplesPerSecond = 1833
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14442371; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.13087s; TotalTimePerSample = 0.52350ms; SamplesPerSecond = 1910
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17753819; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.13640s; TotalTimePerSample = 0.54560ms; SamplesPerSecond = 1832
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15087855; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.13907s; TotalTimePerSample = 0.55627ms; SamplesPerSecond = 1797
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19253023; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.13523s; TotalTimePerSample = 0.54090ms; SamplesPerSecond = 1848
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17830684; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.14474s; TotalTimePerSample = 0.57897ms; SamplesPerSecond = 1727
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15115428; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.14352s; TotalTimePerSample = 0.57407ms; SamplesPerSecond = 1741
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19135968; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.14338s; TotalTimePerSample = 0.57352ms; SamplesPerSecond = 1743
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.21491485; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.14566s; TotalTimePerSample = 0.58264ms; SamplesPerSecond = 1716
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18682346; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.13549s; TotalTimePerSample = 0.54194ms; SamplesPerSecond = 1845
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18483205; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14315s; TotalTimePerSample = 0.57262ms; SamplesPerSecond = 1746
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14684504; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.14583s; TotalTimePerSample = 0.58333ms; SamplesPerSecond = 1714
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15322115; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14638s; TotalTimePerSample = 0.58552ms; SamplesPerSecond = 1707
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19882571; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.14054s; TotalTimePerSample = 0.56214ms; SamplesPerSecond = 1778
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13683833; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.14019s; TotalTimePerSample = 0.56076ms; SamplesPerSecond = 1783
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18621188; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.13443s; TotalTimePerSample = 0.53771ms; SamplesPerSecond = 1859
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19408048; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.13228s; TotalTimePerSample = 0.52910ms; SamplesPerSecond = 1890
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17298137; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.14559s; TotalTimePerSample = 0.58236ms; SamplesPerSecond = 1717
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13265130; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.13309s; TotalTimePerSample = 0.53238ms; SamplesPerSecond = 1878
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17627178; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.13118s; TotalTimePerSample = 0.52470ms; SamplesPerSecond = 1905
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12734628; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12731s; TotalTimePerSample = 0.50923ms; SamplesPerSecond = 1963
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15108451; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12619s; TotalTimePerSample = 0.50474ms; SamplesPerSecond = 1981
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19729184; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12375s; TotalTimePerSample = 0.49498ms; SamplesPerSecond = 2020
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12857332; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12239s; TotalTimePerSample = 0.48956ms; SamplesPerSecond = 2042
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13867804; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12138s; TotalTimePerSample = 0.48554ms; SamplesPerSecond = 2059
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12786050; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12245s; TotalTimePerSample = 0.48981ms; SamplesPerSecond = 2041
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16643303; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12604s; TotalTimePerSample = 0.50418ms; SamplesPerSecond = 1983
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20440409; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12258s; TotalTimePerSample = 0.49033ms; SamplesPerSecond = 2039
+MPI Rank 2:  Epoch[ 2 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14566238; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12077s; TotalTimePerSample = 0.48308ms; SamplesPerSecond = 2070
+MPI Rank 2: Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 0.20373029; EvalErrPerSample = 0.0827; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.454115
+MPI Rank 2: Starting Epoch 3: learning rate per sample = 0.008000  effective momentum = 0.900000 
+MPI Rank 2: starting epoch 2 at record count 20000, and file position 0
+MPI Rank 2: already there from last epoch
+MPI Rank 2: 
+MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 4, NumGradientBits = 32).
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12590085; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12007s; TotalTimePerSample = 0.48030ms; SamplesPerSecond = 2082
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17780229; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12502s; TotalTimePerSample = 0.50009ms; SamplesPerSecond = 1999
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14417637; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12269s; TotalTimePerSample = 0.49075ms; SamplesPerSecond = 2037
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15796895; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12210s; TotalTimePerSample = 0.48842ms; SamplesPerSecond = 2047
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17002999; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12374s; TotalTimePerSample = 0.49496ms; SamplesPerSecond = 2020
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18262114; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.11485s; TotalTimePerSample = 0.45938ms; SamplesPerSecond = 2176
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14643694; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.11641s; TotalTimePerSample = 0.46564ms; SamplesPerSecond = 2147
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18030528; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12303s; TotalTimePerSample = 0.49210ms; SamplesPerSecond = 2032
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15846150; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12244s; TotalTimePerSample = 0.48977ms; SamplesPerSecond = 2041
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14486534; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12013s; TotalTimePerSample = 0.48050ms; SamplesPerSecond = 2081
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13469093; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.11843s; TotalTimePerSample = 0.47373ms; SamplesPerSecond = 2110
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12696s; TotalTimePerSample = 0.50785ms; SamplesPerSecond = 1969
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.11641295; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.11963s; TotalTimePerSample = 0.47853ms; SamplesPerSecond = 2089
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16786647; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.11897s; TotalTimePerSample = 0.47587ms; SamplesPerSecond = 2101
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12811514; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12430s; TotalTimePerSample = 0.49719ms; SamplesPerSecond = 2011
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17257851; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.11756s; TotalTimePerSample = 0.47023ms; SamplesPerSecond = 2126
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17623656; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12107s; TotalTimePerSample = 0.48428ms; SamplesPerSecond = 2064
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14121117; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12174s; TotalTimePerSample = 0.48696ms; SamplesPerSecond = 2053
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19243443; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12139s; TotalTimePerSample = 0.48555ms; SamplesPerSecond = 2059
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20908161; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12892s; TotalTimePerSample = 0.51567ms; SamplesPerSecond = 1939
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18472067; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12850s; TotalTimePerSample = 0.51400ms; SamplesPerSecond = 1945
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18185535; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12240s; TotalTimePerSample = 0.48959ms; SamplesPerSecond = 2042
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14074205; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.13033s; TotalTimePerSample = 0.52130ms; SamplesPerSecond = 1918
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14871620; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.11995s; TotalTimePerSample = 0.47979ms; SamplesPerSecond = 2084
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20299704; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12241s; TotalTimePerSample = 0.48962ms; SamplesPerSecond = 2042
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12852038; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12054s; TotalTimePerSample = 0.48216ms; SamplesPerSecond = 2074
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18660439; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12419s; TotalTimePerSample = 0.49676ms; SamplesPerSecond = 2013
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19575997; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12071s; TotalTimePerSample = 0.48283ms; SamplesPerSecond = 2071
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16667676; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12485s; TotalTimePerSample = 0.49938ms; SamplesPerSecond = 2002
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12526169; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12473s; TotalTimePerSample = 0.49894ms; SamplesPerSecond = 2004
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17392131; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12339s; TotalTimePerSample = 0.49358ms; SamplesPerSecond = 2026
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12281615; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12287s; TotalTimePerSample = 0.49147ms; SamplesPerSecond = 2034
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14759390; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.11621s; TotalTimePerSample = 0.46485ms; SamplesPerSecond = 2151
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19801300; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12491s; TotalTimePerSample = 0.49964ms; SamplesPerSecond = 2001
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12593395; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12309s; TotalTimePerSample = 0.49238ms; SamplesPerSecond = 2030
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13756617; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12019s; TotalTimePerSample = 0.48077ms; SamplesPerSecond = 2080
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12843s; TotalTimePerSample = 0.51372ms; SamplesPerSecond = 1946
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16654369; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.13408s; TotalTimePerSample = 0.53631ms; SamplesPerSecond = 1864
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20658951; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12132s; TotalTimePerSample = 0.48530ms; SamplesPerSecond = 2060
+MPI Rank 2:  Epoch[ 3 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14583322; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.11488s; TotalTimePerSample = 0.45952ms; SamplesPerSecond = 2176
+MPI Rank 2: Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 0.15948618; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.92626
+MPI Rank 2: Starting Epoch 4: learning rate per sample = 0.008000  effective momentum = 0.900000 
+MPI Rank 2: starting epoch 3 at record count 30000, and file position 0
+MPI Rank 2: already there from last epoch
+MPI Rank 2: 
+MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 4, NumGradientBits = 32).
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12371232; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12031s; TotalTimePerSample = 0.48123ms; SamplesPerSecond = 2078
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18070515; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12045s; TotalTimePerSample = 0.48179ms; SamplesPerSecond = 2075
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14239730; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12730s; TotalTimePerSample = 0.50922ms; SamplesPerSecond = 1963
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15630155; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.11938s; TotalTimePerSample = 0.47754ms; SamplesPerSecond = 2094
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16935525; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.11927s; TotalTimePerSample = 0.47708ms; SamplesPerSecond = 2096
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18198833; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12177s; TotalTimePerSample = 0.48708ms; SamplesPerSecond = 2053
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14475946; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12015s; TotalTimePerSample = 0.48060ms; SamplesPerSecond = 2080
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18021601; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.11793s; TotalTimePerSample = 0.47171ms; SamplesPerSecond = 2119
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15849308; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.11865s; TotalTimePerSample = 0.47460ms; SamplesPerSecond = 2107
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14474425; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12154s; TotalTimePerSample = 0.48615ms; SamplesPerSecond = 2056
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13362926; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.11991s; TotalTimePerSample = 0.47965ms; SamplesPerSecond = 2084
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13708299; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12222s; TotalTimePerSample = 0.48887ms; SamplesPerSecond = 2045
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.11569777; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12080s; TotalTimePerSample = 0.48321ms; SamplesPerSecond = 2069
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16892331; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12001s; TotalTimePerSample = 0.48002ms; SamplesPerSecond = 2083
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12752162; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12061s; TotalTimePerSample = 0.48244ms; SamplesPerSecond = 2072
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17100866; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12240s; TotalTimePerSample = 0.48961ms; SamplesPerSecond = 2042
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17660425; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12004s; TotalTimePerSample = 0.48016ms; SamplesPerSecond = 2082
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14105803; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12205s; TotalTimePerSample = 0.48818ms; SamplesPerSecond = 2048
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19333552; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12591s; TotalTimePerSample = 0.50365ms; SamplesPerSecond = 1985
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20859524; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.14391s; TotalTimePerSample = 0.57564ms; SamplesPerSecond = 1737
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18499677; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12836s; TotalTimePerSample = 0.51342ms; SamplesPerSecond = 1947
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18152438; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12739s; TotalTimePerSample = 0.50957ms; SamplesPerSecond = 1962
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14037157; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.11992s; TotalTimePerSample = 0.47966ms; SamplesPerSecond = 2084
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14866862; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12109s; TotalTimePerSample = 0.48438ms; SamplesPerSecond = 2064
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20347747; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.11737s; TotalTimePerSample = 0.46947ms; SamplesPerSecond = 2130
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12815012; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12143s; TotalTimePerSample = 0.48571ms; SamplesPerSecond = 2058
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18672810; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12074s; TotalTimePerSample = 0.48296ms; SamplesPerSecond = 2070
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19552990; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12188s; TotalTimePerSample = 0.48751ms; SamplesPerSecond = 2051
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16452642; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12390s; TotalTimePerSample = 0.49558ms; SamplesPerSecond = 2017
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12461825; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12141s; TotalTimePerSample = 0.48563ms; SamplesPerSecond = 2059
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17285251; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12232s; TotalTimePerSample = 0.48929ms; SamplesPerSecond = 2043
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12253620; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12247s; TotalTimePerSample = 0.48987ms; SamplesPerSecond = 2041
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14723334; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12365s; TotalTimePerSample = 0.49459ms; SamplesPerSecond = 2021
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19789537; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12271s; TotalTimePerSample = 0.49083ms; SamplesPerSecond = 2037
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12575877; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12486s; TotalTimePerSample = 0.49944ms; SamplesPerSecond = 2002
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13745928; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12591s; TotalTimePerSample = 0.50363ms; SamplesPerSecond = 1985
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12839652; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12015s; TotalTimePerSample = 0.48060ms; SamplesPerSecond = 2080
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16647280; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12200s; TotalTimePerSample = 0.48798ms; SamplesPerSecond = 2049
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20679434; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12384s; TotalTimePerSample = 0.49536ms; SamplesPerSecond = 2018
+MPI Rank 2:  Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12606s; TotalTimePerSample = 0.50426ms; SamplesPerSecond = 1983
+MPI Rank 2: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.931381
+MPI Rank 2: CNTKCommandTrainEnd: SimpleMultiGPU
+MPI Rank 2: COMPLETED
+MPI Rank 2: ~MPIWrapper
+MPI Rank 3: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr_SimpleMultiGPU.logrank3
+MPI Rank 3: -------------------------------------------------------------------
+MPI Rank 3: Build info: 
+MPI Rank 3: 
+MPI Rank 3: 		Built time: Oct 24 2015 13:33:25
+MPI Rank 3: 		Last modified date: Thu Oct 22 16:00:27 2015
+MPI Rank 3: 		Built by amitaga on Amitaga-Win-DT3           
+MPI Rank 3: 		Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\
+MPI Rank 3: 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
+MPI Rank 3: -------------------------------------------------------------------
+MPI Rank 3: running on Amitaga-Win-DT3 at 2015/10/24 21:49:39
+MPI Rank 3: command line: 
+MPI Rank 3: E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining/SimpleMultiGPU.config RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining DeviceId=0 precision=float SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]] stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr 
+MPI Rank 3: 
+MPI Rank 3: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
+MPI Rank 3: deviceId=$DeviceId$
+MPI Rank 3: command=SimpleMultiGPU
+MPI Rank 3: precision=float
+MPI Rank 3: parallelTrain=true
+MPI Rank 3: SimpleMultiGPU=[
+MPI Rank 3:     action=train
+MPI Rank 3:     modelPath=$RunDir$/models/Simple.dnn
+MPI Rank 3:     deviceId=$DeviceId$
+MPI Rank 3:     traceLevel=1
+MPI Rank 3:     SimpleNetworkBuilder=[
+MPI Rank 3:         layerSizes=2:50*2:2
+MPI Rank 3:         trainingCriterion=CrossEntropyWithSoftmax
+MPI Rank 3:         evalCriterion=ErrorPrediction
+MPI Rank 3:         layerTypes=Sigmoid
+MPI Rank 3:         initValueScale=1.0
+MPI Rank 3:         applyMeanVarNorm=true
+MPI Rank 3:         uniformInit=true
+MPI Rank 3:         needPrior=true
+MPI Rank 3:     ]
+MPI Rank 3:     SGD=[
+MPI Rank 3:         epochSize=0 
+MPI Rank 3:         minibatchSize=25
+MPI Rank 3:         learningRatesPerMB=0.5:0.2*20:0.1
+MPI Rank 3:         momentumPerMB=0.9
+MPI Rank 3:         dropoutRate=0.0
+MPI Rank 3:         maxEpochs=4
+MPI Rank 3:         ParallelTrain=[
+MPI Rank 3:             parallelizationMethod=DataParallelSGD
+MPI Rank 3:             DataParallelSGD=[
+MPI Rank 3:               gradientBits=1
+MPI Rank 3:             ]
+MPI Rank 3:         ]
+MPI Rank 3:     ]
+MPI Rank 3:     reader=[
+MPI Rank 3:       readerType=UCIFastReader
+MPI Rank 3:       file=$DataDir$/SimpleDataTrain.txt
+MPI Rank 3:       miniBatchMode=Partial
+MPI Rank 3:       randomize=None
+MPI Rank 3:       verbosity=1   
+MPI Rank 3:       features=[
+MPI Rank 3: dim=2      
+MPI Rank 3: start=0    
+MPI Rank 3:       ]
+MPI Rank 3:       labels=[
+MPI Rank 3: start=2      
+MPI Rank 3: dim=1        
+MPI Rank 3: labelDim=2   
+MPI Rank 3:         labelMappingFile=$DataDir$/SimpleMapping.txt
+MPI Rank 3:       ]
+MPI Rank 3:     ]
+MPI Rank 3: ]
+MPI Rank 3: RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 3: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data
+MPI Rank 3: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining
+MPI Rank 3: DeviceId=0
+MPI Rank 3: precision=float
+MPI Rank 3: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
+MPI Rank 3: stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
+MPI Rank 3: 
+MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
+MPI Rank 3: 
+MPI Rank 3: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+MPI Rank 3: deviceId=0
+MPI Rank 3: command=SimpleMultiGPU
+MPI Rank 3: precision=float
+MPI Rank 3: parallelTrain=true
+MPI Rank 3: SimpleMultiGPU=[
+MPI Rank 3:     action=train
+MPI Rank 3:     modelPath=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 3:     deviceId=0
+MPI Rank 3:     traceLevel=1
+MPI Rank 3:     SimpleNetworkBuilder=[
+MPI Rank 3:         layerSizes=2:50*2:2
+MPI Rank 3:         trainingCriterion=CrossEntropyWithSoftmax
+MPI Rank 3:         evalCriterion=ErrorPrediction
+MPI Rank 3:         layerTypes=Sigmoid
+MPI Rank 3:         initValueScale=1.0
+MPI Rank 3:         applyMeanVarNorm=true
+MPI Rank 3:         uniformInit=true
+MPI Rank 3:         needPrior=true
+MPI Rank 3:     ]
+MPI Rank 3:     SGD=[
+MPI Rank 3:         epochSize=0 
+MPI Rank 3:         minibatchSize=25
+MPI Rank 3:         learningRatesPerMB=0.5:0.2*20:0.1
+MPI Rank 3:         momentumPerMB=0.9
+MPI Rank 3:         dropoutRate=0.0
+MPI Rank 3:         maxEpochs=4
+MPI Rank 3:         ParallelTrain=[
+MPI Rank 3:             parallelizationMethod=DataParallelSGD
+MPI Rank 3:             DataParallelSGD=[
+MPI Rank 3:               gradientBits=1
+MPI Rank 3:             ]
+MPI Rank 3:         ]
+MPI Rank 3:     ]
+MPI Rank 3:     reader=[
+MPI Rank 3:       readerType=UCIFastReader
+MPI Rank 3:       file=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 3:       miniBatchMode=Partial
+MPI Rank 3:       randomize=None
+MPI Rank 3:       verbosity=1   
+MPI Rank 3:       features=[
+MPI Rank 3: dim=2      
+MPI Rank 3: start=0    
+MPI Rank 3:       ]
+MPI Rank 3:       labels=[
+MPI Rank 3: start=2      
+MPI Rank 3: dim=1        
+MPI Rank 3: labelDim=2   
+MPI Rank 3:         labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleMapping.txt
+MPI Rank 3:       ]
+MPI Rank 3:     ]
+MPI Rank 3: ]
+MPI Rank 3: RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 3: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data
+MPI Rank 3: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining
+MPI Rank 3: DeviceId=0
+MPI Rank 3: precision=float
+MPI Rank 3: SimpleMultiGPU=[SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
+MPI Rank 3: stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
+MPI Rank 3: 
+MPI Rank 3: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+MPI Rank 3: 
+MPI Rank 3: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
+MPI Rank 3: configparameters: SimpleMultiGPU.config:command=SimpleMultiGPU
+MPI Rank 3: configparameters: SimpleMultiGPU.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining
+MPI Rank 3: configparameters: SimpleMultiGPU.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data
+MPI Rank 3: configparameters: SimpleMultiGPU.config:deviceId=0
+MPI Rank 3: configparameters: SimpleMultiGPU.config:parallelTrain=true
+MPI Rank 3: configparameters: SimpleMultiGPU.config:precision=float
+MPI Rank 3: configparameters: SimpleMultiGPU.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu
+MPI Rank 3: configparameters: SimpleMultiGPU.config:SimpleMultiGPU=[
+MPI Rank 3:     action=train
+MPI Rank 3:     modelPath=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 3:     deviceId=0
+MPI Rank 3:     traceLevel=1
+MPI Rank 3:     SimpleNetworkBuilder=[
+MPI Rank 3:         layerSizes=2:50*2:2
+MPI Rank 3:         trainingCriterion=CrossEntropyWithSoftmax
+MPI Rank 3:         evalCriterion=ErrorPrediction
+MPI Rank 3:         layerTypes=Sigmoid
+MPI Rank 3:         initValueScale=1.0
+MPI Rank 3:         applyMeanVarNorm=true
+MPI Rank 3:         uniformInit=true
+MPI Rank 3:         needPrior=true
+MPI Rank 3:     ]
+MPI Rank 3:     SGD=[
+MPI Rank 3:         epochSize=0 
+MPI Rank 3:         minibatchSize=25
+MPI Rank 3:         learningRatesPerMB=0.5:0.2*20:0.1
+MPI Rank 3:         momentumPerMB=0.9
+MPI Rank 3:         dropoutRate=0.0
+MPI Rank 3:         maxEpochs=4
+MPI Rank 3:         ParallelTrain=[
+MPI Rank 3:             parallelizationMethod=DataParallelSGD
+MPI Rank 3:             DataParallelSGD=[
+MPI Rank 3:               gradientBits=1
+MPI Rank 3:             ]
+MPI Rank 3:         ]
+MPI Rank 3:     ]
+MPI Rank 3:     reader=[
+MPI Rank 3:       readerType=UCIFastReader
+MPI Rank 3:       file=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 3:       miniBatchMode=Partial
+MPI Rank 3:       randomize=None
+MPI Rank 3:       verbosity=1   
+MPI Rank 3:       features=[
+MPI Rank 3: dim=2      
+MPI Rank 3: start=0    
+MPI Rank 3:       ]
+MPI Rank 3:       labels=[
+MPI Rank 3: start=2      
+MPI Rank 3: dim=1        
+MPI Rank 3: labelDim=2   
+MPI Rank 3:         labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleMapping.txt
+MPI Rank 3:       ]
+MPI Rank 3:     ]
+MPI Rank 3: ] [SGD=[ParallelTrain=[DataParallelSGD=[gradientBits=32]]]]
+MPI Rank 3: 
+MPI Rank 3: configparameters: SimpleMultiGPU.config:stderr=C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/stderr
+MPI Rank 3: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
+MPI Rank 3: command: SimpleMultiGPU 
+MPI Rank 3: precision = float
+MPI Rank 3: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024134937.188247\ParallelTraining\NoQuantization_SinglePrecision@debug_gpu/models/Simple.dnn
+MPI Rank 3: CNTKCommandTrainInfo: SimpleMultiGPU : 4
+MPI Rank 3: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 4
+MPI Rank 3: CNTKCommandTrainBegin: SimpleMultiGPU
+MPI Rank 3: SimpleNetworkBuilder Using GPU 0
+MPI Rank 3: reading uci file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\ParallelTraining\Data/SimpleDataTrain.txt
+MPI Rank 3: SetUniformRandomValue (GPU): creating curand object with seed 1
+MPI Rank 3: GetTrainCriterionNodes  ...
+MPI Rank 3: GetEvalCriterionNodes  ...
+MPI Rank 3: 
+MPI Rank 3: 
+MPI Rank 3: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1.
+MPI Rank 3: 
+MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 3: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 3: 
+MPI Rank 3: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2.
+MPI Rank 3: 
+MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 3: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 3: 
+MPI Rank 3: Validating for node CrossEntropyWithSoftmax, final verification.
+MPI Rank 3: 
+MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 3], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 3]
+MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 3], B0[50, 1]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 3], B1[50, 1]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 3]) -> [50, MBSize 3]
+MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 3]) -> [2, MBSize 3]
+MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 3], B2[2, 1]) -> [2, MBSize 3]
+MPI Rank 3: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[2, MBSize 3], HLast[2, MBSize 3]) -> [1, 1]
+MPI Rank 3: 
+MPI Rank 3: 9 out of 20 nodes do not share the minibatch layout with the input data.
+MPI Rank 3: 
+MPI Rank 3: 
+MPI Rank 3: Precomputing --> 3 PreCompute nodes found.
+MPI Rank 3: 
+MPI Rank 3: 	NodeName: InvStdOfFeatures
+MPI Rank 3: 	NodeName: MeanOfFeatures
+MPI Rank 3: 	NodeName: Prior
+MPI Rank 3: starting at epoch 0 counting lines to determine record count
+MPI Rank 3: 
+MPI Rank 3:  10000 records found
+MPI Rank 3: starting epoch 0 at record count 0, and file position 0
+MPI Rank 3: already there from last epoch
+MPI Rank 3: 
+MPI Rank 3: 
+MPI Rank 3: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1.
+MPI Rank 3: 
+MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: 
+MPI Rank 3: Validating for node InvStdOfFeatures, final verification.
+MPI Rank 3: 
+MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: 
+MPI Rank 3: 1 out of 2 nodes do not share the minibatch layout with the input data.
+MPI Rank 3: 
+MPI Rank 3: 
+MPI Rank 3: 
+MPI Rank 3: Validating for node MeanOfFeatures. 2 nodes to process in pass 1.
+MPI Rank 3: 
+MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: 
+MPI Rank 3: Validating for node MeanOfFeatures, final verification.
+MPI Rank 3: 
+MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: 
+MPI Rank 3: 1 out of 2 nodes do not share the minibatch layout with the input data.
+MPI Rank 3: 
+MPI Rank 3: 
+MPI Rank 3: 
+MPI Rank 3: Validating for node Prior. 2 nodes to process in pass 1.
+MPI Rank 3: 
+MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: 
+MPI Rank 3: Validating for node Prior. 1 nodes to process in pass 2.
+MPI Rank 3: 
+MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: 
+MPI Rank 3: Validating for node Prior, final verification.
+MPI Rank 3: 
+MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 3]
+MPI Rank 3: Validating --> Prior = Mean(labels[2, MBSize 3]) -> [2, 1]
+MPI Rank 3: 
+MPI Rank 3: 1 out of 2 nodes do not share the minibatch layout with the input data.
+MPI Rank 3: 
+MPI Rank 3: 
+MPI Rank 3: Precomputing --> Completed.
+MPI Rank 3: 
+MPI Rank 3: Set Max Temp Mem Size For Convolution Nodes to 0 samples.
+MPI Rank 3: Starting Epoch 1: learning rate per sample = 0.020000  effective momentum = 0.900000 
+MPI Rank 3: starting epoch 0 at record count 0, and file position 0
+MPI Rank 3: already there from last epoch
+MPI Rank 3: 
+MPI Rank 3: 
+MPI Rank 3: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1.
+MPI Rank 3: 
+MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 3: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 3: 
+MPI Rank 3: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2.
+MPI Rank 3: 
+MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 3: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 3: 
+MPI Rank 3: Validating for node EvalErrorPrediction, final verification.
+MPI Rank 3: 
+MPI Rank 3: Validating --> labels = InputValue -> [2, MBSize 25]
+MPI Rank 3: Validating --> W2 = LearnableParameter -> [2, 50]
+MPI Rank 3: Validating --> W1 = LearnableParameter -> [50, 50]
+MPI Rank 3: Validating --> W0 = LearnableParameter -> [50, 2]
+MPI Rank 3: Validating --> features = InputValue -> [2, MBSize 25]
+MPI Rank 3: Validating --> MeanOfFeatures = Mean(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 3: Validating --> InvStdOfFeatures = InvStdDev(features[2, MBSize 25]) -> [2, 1]
+MPI Rank 3: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[2, MBSize 25], MeanOfFeatures[2, 1], InvStdOfFeatures[2, 1]) -> [2, MBSize 25]
+MPI Rank 3: Validating --> W0*features = Times(W0[50, 2], MVNormalizedFeatures[2, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> B0 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W0*features+B0 = Plus(W0*features[50, MBSize 25], B0[50, 1]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> H1 = Sigmoid(W0*features+B0[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> W1*H1 = Times(W1[50, 50], H1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> B1 = LearnableParameter -> [50, 1]
+MPI Rank 3: Validating --> W1*H1+B1 = Plus(W1*H1[50, MBSize 25], B1[50, 1]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> H2 = Sigmoid(W1*H1+B1[50, MBSize 25]) -> [50, MBSize 25]
+MPI Rank 3: Validating --> W2*H1 = Times(W2[2, 50], H2[50, MBSize 25]) -> [2, MBSize 25]
+MPI Rank 3: Validating --> B2 = LearnableParameter -> [2, 1]
+MPI Rank 3: Validating --> HLast = Plus(W2*H1[2, MBSize 25], B2[2, 1]) -> [2, MBSize 25]
+MPI Rank 3: Validating --> EvalErrorPrediction = ErrorPrediction(labels[2, MBSize 25], HLast[2, MBSize 25]) -> [1, 1]
+MPI Rank 3: 
+MPI Rank 3: 9 out of 20 nodes do not share the minibatch layout with the input data.
+MPI Rank 3: 
+MPI Rank 3: 
+MPI Rank 3: Starting minibatch loop, DataParallelSGD training (MyRank = 3, NumNodes = 4, NumGradientBits = 32).
+MPI Rank 3: DecimateMinibatchSequences: WARNING: Number of parallel utterances 25 not a multiple of number of GPUs 4, GPU usage will be suboptimal.
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[   1-  10 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70007977; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.21100s; TotalTimePerSample = 0.84399ms; SamplesPerSecond = 1184
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[  11-  20 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71514542; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.17235s; TotalTimePerSample = 0.68940ms; SamplesPerSecond = 1450
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[  21-  30 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72945595; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.16873s; TotalTimePerSample = 0.67493ms; SamplesPerSecond = 1481
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[  31-  40 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70079058; EvalErr[0]PerSample = 0.52400000; TotalTime = 0.16075s; TotalTimePerSample = 0.64300ms; SamplesPerSecond = 1555
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[  41-  50 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70605615; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.16197s; TotalTimePerSample = 0.64788ms; SamplesPerSecond = 1543
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[  51-  60 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71572398; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.15188s; TotalTimePerSample = 0.60751ms; SamplesPerSecond = 1646
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[  61-  70 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72149851; EvalErr[0]PerSample = 0.48000000; TotalTime = 0.14181s; TotalTimePerSample = 0.56722ms; SamplesPerSecond = 1762
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[  71-  80 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.79845604; EvalErr[0]PerSample = 0.47600000; TotalTime = 0.14178s; TotalTimePerSample = 0.56714ms; SamplesPerSecond = 1763
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[  81-  90 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69665186; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.13817s; TotalTimePerSample = 0.55266ms; SamplesPerSecond = 1809
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[  91- 100 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70723326; EvalErr[0]PerSample = 0.49200000; TotalTime = 0.12779s; TotalTimePerSample = 0.51114ms; SamplesPerSecond = 1956
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 101- 110 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71420345; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.13020s; TotalTimePerSample = 0.52080ms; SamplesPerSecond = 1920
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 111- 120 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69535258; EvalErr[0]PerSample = 0.43600000; TotalTime = 0.12825s; TotalTimePerSample = 0.51300ms; SamplesPerSecond = 1949
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 121- 130 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70078531; EvalErr[0]PerSample = 0.44000000; TotalTime = 0.12355s; TotalTimePerSample = 0.49419ms; SamplesPerSecond = 2023
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 131- 140 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71857916; EvalErr[0]PerSample = 0.54800000; TotalTime = 0.11948s; TotalTimePerSample = 0.47792ms; SamplesPerSecond = 2092
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 141- 150 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72088358; EvalErr[0]PerSample = 0.48800000; TotalTime = 0.11900s; TotalTimePerSample = 0.47600ms; SamplesPerSecond = 2100
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 151- 160 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71798840; EvalErr[0]PerSample = 0.55200000; TotalTime = 0.11648s; TotalTimePerSample = 0.46593ms; SamplesPerSecond = 2146
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 161- 170 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.74162164; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.11763s; TotalTimePerSample = 0.47053ms; SamplesPerSecond = 2125
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 171- 180 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71835128; EvalErr[0]PerSample = 0.51600000; TotalTime = 0.11827s; TotalTimePerSample = 0.47306ms; SamplesPerSecond = 2113
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 181- 190 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71529463; EvalErr[0]PerSample = 0.48400000; TotalTime = 0.12530s; TotalTimePerSample = 0.50120ms; SamplesPerSecond = 1995
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 191- 200 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71727657; EvalErr[0]PerSample = 0.53200000; TotalTime = 0.12000s; TotalTimePerSample = 0.48001ms; SamplesPerSecond = 2083
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 201- 210 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71745517; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.11894s; TotalTimePerSample = 0.47577ms; SamplesPerSecond = 2101
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 211- 220 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72088398; EvalErr[0]PerSample = 0.50000000; TotalTime = 0.11488s; TotalTimePerSample = 0.45951ms; SamplesPerSecond = 2176
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 221- 230 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.72006809; EvalErr[0]PerSample = 0.50800000; TotalTime = 0.12400s; TotalTimePerSample = 0.49601ms; SamplesPerSecond = 2016
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 231- 240 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.71275468; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12093s; TotalTimePerSample = 0.48372ms; SamplesPerSecond = 2067
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 241- 250 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69644781; EvalErr[0]PerSample = 0.50400000; TotalTime = 0.11801s; TotalTimePerSample = 0.47202ms; SamplesPerSecond = 2118
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 251- 260 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70129698; EvalErr[0]PerSample = 0.51200000; TotalTime = 0.12166s; TotalTimePerSample = 0.48665ms; SamplesPerSecond = 2054
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 261- 270 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70768095; EvalErr[0]PerSample = 0.54400000; TotalTime = 0.12138s; TotalTimePerSample = 0.48553ms; SamplesPerSecond = 2059
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 271- 280 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69744379; EvalErr[0]PerSample = 0.52800000; TotalTime = 0.11718s; TotalTimePerSample = 0.46872ms; SamplesPerSecond = 2133
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 281- 290 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69266187; EvalErr[0]PerSample = 0.44800000; TotalTime = 0.11796s; TotalTimePerSample = 0.47183ms; SamplesPerSecond = 2119
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 291- 300 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69347266; EvalErr[0]PerSample = 0.49600000; TotalTime = 0.12167s; TotalTimePerSample = 0.48668ms; SamplesPerSecond = 2054
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 301- 310 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69257409; EvalErr[0]PerSample = 0.54000000; TotalTime = 0.12564s; TotalTimePerSample = 0.50255ms; SamplesPerSecond = 1989
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 311- 320 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.68625741; EvalErr[0]PerSample = 0.38000000; TotalTime = 0.12551s; TotalTimePerSample = 0.50204ms; SamplesPerSecond = 1991
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 321- 330 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69064011; EvalErr[0]PerSample = 0.46800000; TotalTime = 0.12341s; TotalTimePerSample = 0.49363ms; SamplesPerSecond = 2025
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 331- 340 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.70192154; EvalErr[0]PerSample = 0.46000000; TotalTime = 0.12370s; TotalTimePerSample = 0.49480ms; SamplesPerSecond = 2021
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 341- 350 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.69058912; EvalErr[0]PerSample = 0.52000000; TotalTime = 0.12358s; TotalTimePerSample = 0.49434ms; SamplesPerSecond = 2022
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 351- 360 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.67041492; EvalErr[0]PerSample = 0.39200000; TotalTime = 0.12207s; TotalTimePerSample = 0.48827ms; SamplesPerSecond = 2048
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 361- 370 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.65913973; EvalErr[0]PerSample = 0.35600000; TotalTime = 0.12947s; TotalTimePerSample = 0.51787ms; SamplesPerSecond = 1930
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 371- 380 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.63919877; EvalErr[0]PerSample = 0.36400000; TotalTime = 0.14402s; TotalTimePerSample = 0.57607ms; SamplesPerSecond = 1735
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 381- 390 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.61293883; EvalErr[0]PerSample = 0.19200000; TotalTime = 0.14409s; TotalTimePerSample = 0.57637ms; SamplesPerSecond = 1735
+MPI Rank 3:  Epoch[ 1 of 4]-Minibatch[ 391- 400 of -1]: SamplesSeen = 250; TrainLossPerSample =  0.55255352; EvalErr[0]PerSample = 0.18800000; TotalTime = 0.14499s; TotalTimePerSample = 0.57995ms; SamplesPerSecond = 1724
+MPI Rank 3: Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 0.70019555; EvalErrPerSample = 0.4735; Ave LearnRatePerSample = 0.01999999955; EpochTime=5.369658
+MPI Rank 3: Starting Epoch 2: learning rate per sample = 0.008000  effective momentum = 0.900000 
+MPI Rank 3: starting epoch 1 at record count 10000, and file position 0
+MPI Rank 3: already there from last epoch
+MPI Rank 3: 
+MPI Rank 3: Starting minibatch loop, DataParallelSGD training (MyRank = 3, NumNodes = 4, NumGradientBits = 32).
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.50774625; EvalErr[0]PerSample = 0.24000000; TotalTime = 0.13824s; TotalTimePerSample = 0.55294ms; SamplesPerSecond = 1808
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.43388927; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.13875s; TotalTimePerSample = 0.55502ms; SamplesPerSecond = 1801
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.36674870; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.14007s; TotalTimePerSample = 0.56027ms; SamplesPerSecond = 1784
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.33768765; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.13713s; TotalTimePerSample = 0.54853ms; SamplesPerSecond = 1823
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.30320946; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12833s; TotalTimePerSample = 0.51332ms; SamplesPerSecond = 1948
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.29576043; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.14854s; TotalTimePerSample = 0.59414ms; SamplesPerSecond = 1683
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.24924491; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14657s; TotalTimePerSample = 0.58628ms; SamplesPerSecond = 1705
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.24632415; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.14366s; TotalTimePerSample = 0.57462ms; SamplesPerSecond = 1740
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20943158; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.13604s; TotalTimePerSample = 0.54416ms; SamplesPerSecond = 1837
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19115996; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.13442s; TotalTimePerSample = 0.53768ms; SamplesPerSecond = 1859
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17923231; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.13593s; TotalTimePerSample = 0.54372ms; SamplesPerSecond = 1839
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17075422; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.13592s; TotalTimePerSample = 0.54368ms; SamplesPerSecond = 1839
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14442371; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.13116s; TotalTimePerSample = 0.52464ms; SamplesPerSecond = 1906
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17753819; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.13549s; TotalTimePerSample = 0.54197ms; SamplesPerSecond = 1845
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15087855; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.13867s; TotalTimePerSample = 0.55469ms; SamplesPerSecond = 1802
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19253023; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.13507s; TotalTimePerSample = 0.54028ms; SamplesPerSecond = 1850
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17830684; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.14524s; TotalTimePerSample = 0.58098ms; SamplesPerSecond = 1721
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15115428; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.14260s; TotalTimePerSample = 0.57040ms; SamplesPerSecond = 1753
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19135968; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.14288s; TotalTimePerSample = 0.57152ms; SamplesPerSecond = 1749
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.21491485; EvalErr[0]PerSample = 0.10400000; TotalTime = 0.14526s; TotalTimePerSample = 0.58103ms; SamplesPerSecond = 1721
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18682346; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.13489s; TotalTimePerSample = 0.53955ms; SamplesPerSecond = 1853
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18483205; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14344s; TotalTimePerSample = 0.57375ms; SamplesPerSecond = 1742
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14684504; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.14543s; TotalTimePerSample = 0.58172ms; SamplesPerSecond = 1719
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15322115; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.14576s; TotalTimePerSample = 0.58304ms; SamplesPerSecond = 1715
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19882571; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.13993s; TotalTimePerSample = 0.55974ms; SamplesPerSecond = 1786
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13683833; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.13986s; TotalTimePerSample = 0.55944ms; SamplesPerSecond = 1787
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18621188; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.13396s; TotalTimePerSample = 0.53586ms; SamplesPerSecond = 1866
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19408048; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.13196s; TotalTimePerSample = 0.52782ms; SamplesPerSecond = 1894
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17298137; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.14577s; TotalTimePerSample = 0.58307ms; SamplesPerSecond = 1715
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13265130; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.13253s; TotalTimePerSample = 0.53014ms; SamplesPerSecond = 1886
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17627178; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.13034s; TotalTimePerSample = 0.52135ms; SamplesPerSecond = 1918
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12734628; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12706s; TotalTimePerSample = 0.50823ms; SamplesPerSecond = 1967
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15108451; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12629s; TotalTimePerSample = 0.50514ms; SamplesPerSecond = 1979
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19729184; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12368s; TotalTimePerSample = 0.49470ms; SamplesPerSecond = 2021
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12857332; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12167s; TotalTimePerSample = 0.48667ms; SamplesPerSecond = 2054
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13867804; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12142s; TotalTimePerSample = 0.48569ms; SamplesPerSecond = 2058
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12786050; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12244s; TotalTimePerSample = 0.48977ms; SamplesPerSecond = 2041
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16643303; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12588s; TotalTimePerSample = 0.50352ms; SamplesPerSecond = 1986
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20440409; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12226s; TotalTimePerSample = 0.48902ms; SamplesPerSecond = 2044
+MPI Rank 3:  Epoch[ 2 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14566238; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12138s; TotalTimePerSample = 0.48551ms; SamplesPerSecond = 2059
+MPI Rank 3: Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 0.20373029; EvalErrPerSample = 0.0827; Ave LearnRatePerSample = 0.00800000038; EpochTime=5.454936
+MPI Rank 3: Starting Epoch 3: learning rate per sample = 0.008000  effective momentum = 0.900000 
+MPI Rank 3: starting epoch 2 at record count 20000, and file position 0
+MPI Rank 3: already there from last epoch
+MPI Rank 3: 
+MPI Rank 3: Starting minibatch loop, DataParallelSGD training (MyRank = 3, NumNodes = 4, NumGradientBits = 32).
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12590085; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12023s; TotalTimePerSample = 0.48090ms; SamplesPerSecond = 2079
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17780229; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12465s; TotalTimePerSample = 0.49858ms; SamplesPerSecond = 2005
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14417637; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12297s; TotalTimePerSample = 0.49187ms; SamplesPerSecond = 2033
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15796895; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12257s; TotalTimePerSample = 0.49028ms; SamplesPerSecond = 2039
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17002999; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12288s; TotalTimePerSample = 0.49151ms; SamplesPerSecond = 2034
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18262114; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.11518s; TotalTimePerSample = 0.46071ms; SamplesPerSecond = 2170
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14643694; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.11587s; TotalTimePerSample = 0.46348ms; SamplesPerSecond = 2157
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18030528; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12357s; TotalTimePerSample = 0.49427ms; SamplesPerSecond = 2023
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15846150; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12249s; TotalTimePerSample = 0.48998ms; SamplesPerSecond = 2040
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14486534; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.11970s; TotalTimePerSample = 0.47880ms; SamplesPerSecond = 2088
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13469093; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.11866s; TotalTimePerSample = 0.47462ms; SamplesPerSecond = 2106
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13720019; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12676s; TotalTimePerSample = 0.50704ms; SamplesPerSecond = 1972
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.11641295; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.11993s; TotalTimePerSample = 0.47971ms; SamplesPerSecond = 2084
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16786647; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.11853s; TotalTimePerSample = 0.47412ms; SamplesPerSecond = 2109
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12811514; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12430s; TotalTimePerSample = 0.49721ms; SamplesPerSecond = 2011
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17257851; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.11774s; TotalTimePerSample = 0.47097ms; SamplesPerSecond = 2123
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17623656; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12213s; TotalTimePerSample = 0.48850ms; SamplesPerSecond = 2047
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14121117; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12045s; TotalTimePerSample = 0.48180ms; SamplesPerSecond = 2075
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19243443; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12144s; TotalTimePerSample = 0.48574ms; SamplesPerSecond = 2058
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20908161; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12909s; TotalTimePerSample = 0.51637ms; SamplesPerSecond = 1936
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18472067; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12856s; TotalTimePerSample = 0.51422ms; SamplesPerSecond = 1944
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18185535; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12229s; TotalTimePerSample = 0.48917ms; SamplesPerSecond = 2044
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14074205; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.13026s; TotalTimePerSample = 0.52105ms; SamplesPerSecond = 1919
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14871620; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12010s; TotalTimePerSample = 0.48041ms; SamplesPerSecond = 2081
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20299704; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.12197s; TotalTimePerSample = 0.48786ms; SamplesPerSecond = 2049
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12852038; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12039s; TotalTimePerSample = 0.48156ms; SamplesPerSecond = 2076
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18660439; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12410s; TotalTimePerSample = 0.49640ms; SamplesPerSecond = 2014
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19575997; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12024s; TotalTimePerSample = 0.48097ms; SamplesPerSecond = 2079
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16667676; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12499s; TotalTimePerSample = 0.49996ms; SamplesPerSecond = 2000
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12526169; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12472s; TotalTimePerSample = 0.49889ms; SamplesPerSecond = 2004
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17392131; EvalErr[0]PerSample = 0.08800000; TotalTime = 0.12303s; TotalTimePerSample = 0.49213ms; SamplesPerSecond = 2031
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12281615; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12350s; TotalTimePerSample = 0.49400ms; SamplesPerSecond = 2024
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14759390; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.11582s; TotalTimePerSample = 0.46327ms; SamplesPerSecond = 2158
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19801300; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12505s; TotalTimePerSample = 0.50019ms; SamplesPerSecond = 1999
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12593395; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12306s; TotalTimePerSample = 0.49225ms; SamplesPerSecond = 2031
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13756617; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12046s; TotalTimePerSample = 0.48184ms; SamplesPerSecond = 2075
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12838526; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12798s; TotalTimePerSample = 0.51194ms; SamplesPerSecond = 1953
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16654369; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.13400s; TotalTimePerSample = 0.53600ms; SamplesPerSecond = 1865
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20658951; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12144s; TotalTimePerSample = 0.48578ms; SamplesPerSecond = 2058
+MPI Rank 3:  Epoch[ 3 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14583322; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.11522s; TotalTimePerSample = 0.46087ms; SamplesPerSecond = 2169
+MPI Rank 3: Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 0.15948618; EvalErrPerSample = 0.0766; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.926698
+MPI Rank 3: Starting Epoch 4: learning rate per sample = 0.008000  effective momentum = 0.900000 
+MPI Rank 3: starting epoch 3 at record count 30000, and file position 0
+MPI Rank 3: already there from last epoch
+MPI Rank 3: 
+MPI Rank 3: Starting minibatch loop, DataParallelSGD training (MyRank = 3, NumNodes = 4, NumGradientBits = 32).
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[   1-  10 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12371232; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12049s; TotalTimePerSample = 0.48194ms; SamplesPerSecond = 2074
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[  11-  20 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18070515; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12028s; TotalTimePerSample = 0.48113ms; SamplesPerSecond = 2078
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[  21-  30 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14239730; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12702s; TotalTimePerSample = 0.50807ms; SamplesPerSecond = 1968
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[  31-  40 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15630155; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.11959s; TotalTimePerSample = 0.47836ms; SamplesPerSecond = 2090
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[  41-  50 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16935525; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.11886s; TotalTimePerSample = 0.47544ms; SamplesPerSecond = 2103
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[  51-  60 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18198833; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12210s; TotalTimePerSample = 0.48839ms; SamplesPerSecond = 2047
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[  61-  70 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14475946; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.11983s; TotalTimePerSample = 0.47932ms; SamplesPerSecond = 2086
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[  71-  80 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18021601; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.11840s; TotalTimePerSample = 0.47362ms; SamplesPerSecond = 2111
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[  81-  90 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.15849308; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.11869s; TotalTimePerSample = 0.47474ms; SamplesPerSecond = 2106
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[  91- 100 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14474425; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12097s; TotalTimePerSample = 0.48389ms; SamplesPerSecond = 2066
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 101- 110 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13362926; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12025s; TotalTimePerSample = 0.48102ms; SamplesPerSecond = 2078
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 111- 120 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13708299; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12236s; TotalTimePerSample = 0.48943ms; SamplesPerSecond = 2043
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 121- 130 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.11569777; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.12025s; TotalTimePerSample = 0.48100ms; SamplesPerSecond = 2079
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 131- 140 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16892331; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.11957s; TotalTimePerSample = 0.47830ms; SamplesPerSecond = 2090
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 141- 150 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12752162; EvalErr[0]PerSample = 0.04800000; TotalTime = 0.12099s; TotalTimePerSample = 0.48394ms; SamplesPerSecond = 2066
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 151- 160 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17100866; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12234s; TotalTimePerSample = 0.48934ms; SamplesPerSecond = 2043
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 161- 170 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17660425; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.11970s; TotalTimePerSample = 0.47880ms; SamplesPerSecond = 2088
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 171- 180 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14105803; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12238s; TotalTimePerSample = 0.48954ms; SamplesPerSecond = 2042
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 181- 190 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19333552; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.12576s; TotalTimePerSample = 0.50306ms; SamplesPerSecond = 1987
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 191- 200 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20859524; EvalErr[0]PerSample = 0.10000000; TotalTime = 0.14403s; TotalTimePerSample = 0.57612ms; SamplesPerSecond = 1735
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 201- 210 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18499677; EvalErr[0]PerSample = 0.08000000; TotalTime = 0.12854s; TotalTimePerSample = 0.51416ms; SamplesPerSecond = 1944
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 211- 220 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18152438; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12730s; TotalTimePerSample = 0.50919ms; SamplesPerSecond = 1963
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 221- 230 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14037157; EvalErr[0]PerSample = 0.05600000; TotalTime = 0.11977s; TotalTimePerSample = 0.47907ms; SamplesPerSecond = 2087
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 231- 240 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14866862; EvalErr[0]PerSample = 0.07600000; TotalTime = 0.12054s; TotalTimePerSample = 0.48217ms; SamplesPerSecond = 2073
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 241- 250 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20347747; EvalErr[0]PerSample = 0.11200000; TotalTime = 0.11766s; TotalTimePerSample = 0.47064ms; SamplesPerSecond = 2124
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 251- 260 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12815012; EvalErr[0]PerSample = 0.07200000; TotalTime = 0.12102s; TotalTimePerSample = 0.48407ms; SamplesPerSecond = 2065
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 261- 270 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.18672810; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12104s; TotalTimePerSample = 0.48415ms; SamplesPerSecond = 2065
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 271- 280 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19552990; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12194s; TotalTimePerSample = 0.48776ms; SamplesPerSecond = 2050
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 281- 290 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16452642; EvalErr[0]PerSample = 0.06800000; TotalTime = 0.12325s; TotalTimePerSample = 0.49300ms; SamplesPerSecond = 2028
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 291- 300 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12461825; EvalErr[0]PerSample = 0.04400000; TotalTime = 0.12225s; TotalTimePerSample = 0.48901ms; SamplesPerSecond = 2044
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 301- 310 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.17285251; EvalErr[0]PerSample = 0.08400000; TotalTime = 0.12176s; TotalTimePerSample = 0.48705ms; SamplesPerSecond = 2053
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 311- 320 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12253620; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12281s; TotalTimePerSample = 0.49122ms; SamplesPerSecond = 2035
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 321- 330 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14723334; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12368s; TotalTimePerSample = 0.49473ms; SamplesPerSecond = 2021
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 331- 340 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.19789537; EvalErr[0]PerSample = 0.09200000; TotalTime = 0.12262s; TotalTimePerSample = 0.49048ms; SamplesPerSecond = 2038
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 341- 350 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12575877; EvalErr[0]PerSample = 0.05200000; TotalTime = 0.12474s; TotalTimePerSample = 0.49897ms; SamplesPerSecond = 2004
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 351- 360 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.13745928; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12588s; TotalTimePerSample = 0.50350ms; SamplesPerSecond = 1986
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 361- 370 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.12839652; EvalErr[0]PerSample = 0.06000000; TotalTime = 0.12028s; TotalTimePerSample = 0.48113ms; SamplesPerSecond = 2078
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 371- 380 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.16647280; EvalErr[0]PerSample = 0.09600000; TotalTime = 0.12155s; TotalTimePerSample = 0.48618ms; SamplesPerSecond = 2056
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 381- 390 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.20679434; EvalErr[0]PerSample = 0.11600000; TotalTime = 0.12413s; TotalTimePerSample = 0.49654ms; SamplesPerSecond = 2013
+MPI Rank 3:  Epoch[ 4 of 4]-Minibatch[ 391- 400 of 400]: SamplesSeen = 250; TrainLossPerSample =  0.14585245; EvalErr[0]PerSample = 0.06400000; TotalTime = 0.12588s; TotalTimePerSample = 0.50353ms; SamplesPerSecond = 1985
+MPI Rank 3: Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.15914931; EvalErrPerSample = 0.0767; Ave LearnRatePerSample = 0.00800000038; EpochTime=4.931393
+MPI Rank 3: CNTKCommandTrainEnd: SimpleMultiGPU
+MPI Rank 3: COMPLETED
+MPI Rank 3: ~MPIWrapper
diff --git a/Tests/Speech/DNN/DiscriminativePreTraining/baseline.gpu.txt b/Tests/Speech/DNN/DiscriminativePreTraining/baseline.gpu.txt
index f0f1c5727..dc101bc81 100644
--- a/Tests/Speech/DNN/DiscriminativePreTraining/baseline.gpu.txt
+++ b/Tests/Speech/DNN/DiscriminativePreTraining/baseline.gpu.txt
@@ -1,13 +1,13 @@
-=== Running /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/cntk_dpt.config RunDir=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data DeviceId=0
-running on localhost at 2015/10/12 18:49:16
-command line options: 
-configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/cntk_dpt.config RunDir=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data DeviceId=0 
+=== Running /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/cntk_dpt.config RunDir=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining DeviceId=0
+running on localhost at 2015/10/24 12:51:56
+command line: 
+/home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/cntk_dpt.config RunDir=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining DeviceId=0 
 
 >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
 precision=float
 deviceId=$DeviceId$
 command=DPT_Pre1:AddLayer2:DPT_Pre2:AddLayer3:speechTrain
-ndlMacros=$DataDir$/ndl/macros.txt
+ndlMacros=$ConfigDir$/macros.txt
 GlobalMean=GlobalStats/mean.363
 GlobalInvStd=GlobalStats/var.363
 GlobalPrior=GlobalStats/prior.132
@@ -25,7 +25,7 @@ DPT_Pre1=[
     action=train
     modelPath=$RunDir$/models/Pre1/cntkSpeech
     NDLNetworkBuilder=[
-        networkDescription=$DataDir$/ndl/dnn_1layer.txt
+        networkDescription=$ConfigDir$/dnn_1layer.txt
     ]
 ]
 AddLayer2=[    
@@ -34,13 +34,13 @@ AddLayer2=[
     NewLayer=2
     CurrModel=$RunDir$/models/Pre1/cntkSpeech
     NewModel=$RunDir$/models/Pre2/cntkSpeech.0
-    editPath=$DataDir$/ndl/add_layer.mel
+    editPath=$ConfigDir$/add_layer.mel
 ]
 DPT_Pre2=[
     action=train
     modelPath=$RunDir$/models/Pre2/cntkSpeech
     NDLNetworkBuilder=[
-        networkDescription=$DataDir$/ndl/dnn_1layer.txt
+        networkDescription=$ConfigDir$/dnn_1layer.txt
     ]
 ]
 AddLayer3=[    
@@ -49,7 +49,7 @@ AddLayer3=[
     NewLayer=3
     CurrModel=$RunDir$/models/Pre2/cntkSpeech
     NewModel=$RunDir$/models/cntkSpeech.0
-    editPath=$DataDir$/ndl/add_layer.mel
+    editPath=$ConfigDir$/add_layer.mel
 ]
 speechTrain=[
     action=train
@@ -57,7 +57,7 @@ speechTrain=[
     deviceId=$DeviceId$
     traceLevel=1
      NDLNetworkBuilder=[
-        networkDescription=$DataDir$/ndl/dnn.txt
+        networkDescription=$ConfigDir$/dnn.txt
     ]
     SGD=[
         epochSize=81920
@@ -90,8 +90,9 @@ reader=[
       labelType=Category
   ]
 ]
-RunDir=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu
+RunDir=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu
 DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
+ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining
 DeviceId=0
 
 <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
@@ -100,7 +101,7 @@ DeviceId=0
 precision=float
 deviceId=0
 command=DPT_Pre1:AddLayer2:DPT_Pre2:AddLayer3:speechTrain
-ndlMacros=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/macros.txt
+ndlMacros=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/macros.txt
 GlobalMean=GlobalStats/mean.363
 GlobalInvStd=GlobalStats/var.363
 GlobalPrior=GlobalStats/prior.132
@@ -116,41 +117,41 @@ SGD=[
 ]
 DPT_Pre1=[
     action=train
-    modelPath=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech
+    modelPath=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech
     NDLNetworkBuilder=[
-        networkDescription=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/dnn_1layer.txt
+        networkDescription=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/dnn_1layer.txt
     ]
 ]
 AddLayer2=[    
     action=edit
     CurrLayer=1
     NewLayer=2
-    CurrModel=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech
-    NewModel=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0
-    editPath=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/add_layer.mel
+    CurrModel=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech
+    NewModel=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0
+    editPath=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/add_layer.mel
 ]
 DPT_Pre2=[
     action=train
-    modelPath=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech
+    modelPath=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech
     NDLNetworkBuilder=[
-        networkDescription=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/dnn_1layer.txt
+        networkDescription=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/dnn_1layer.txt
     ]
 ]
 AddLayer3=[    
     action=edit
     CurrLayer=2
     NewLayer=3
-    CurrModel=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech
-    NewModel=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0
-    editPath=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/add_layer.mel
+    CurrModel=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech
+    NewModel=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0
+    editPath=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/add_layer.mel
 ]
 speechTrain=[
     action=train
-    modelPath=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech
+    modelPath=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech
     deviceId=0
     traceLevel=1
      NDLNetworkBuilder=[
-        networkDescription=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/dnn.txt
+        networkDescription=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/dnn.txt
     ]
     SGD=[
         epochSize=81920
@@ -183,8 +184,9 @@ reader=[
       labelType=Category
   ]
 ]
-RunDir=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu
+RunDir=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu
 DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
+ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining
 DeviceId=0
 
 <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
@@ -194,43 +196,44 @@ configparameters: cntk_dpt.config:AddLayer2=[
     action=edit
     CurrLayer=1
     NewLayer=2
-    CurrModel=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech
-    NewModel=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0
-    editPath=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/add_layer.mel
+    CurrModel=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech
+    NewModel=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0
+    editPath=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/add_layer.mel
 ]
 
 configparameters: cntk_dpt.config:AddLayer3=[    
     action=edit
     CurrLayer=2
     NewLayer=3
-    CurrModel=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech
-    NewModel=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0
-    editPath=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/add_layer.mel
+    CurrModel=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech
+    NewModel=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0
+    editPath=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/add_layer.mel
 ]
 
 configparameters: cntk_dpt.config:command=DPT_Pre1:AddLayer2:DPT_Pre2:AddLayer3:speechTrain
+configparameters: cntk_dpt.config:ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining
 configparameters: cntk_dpt.config:DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
 configparameters: cntk_dpt.config:deviceId=0
 configparameters: cntk_dpt.config:DPT_Pre1=[
     action=train
-    modelPath=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech
+    modelPath=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech
     NDLNetworkBuilder=[
-        networkDescription=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/dnn_1layer.txt
+        networkDescription=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/dnn_1layer.txt
     ]
 ]
 
 configparameters: cntk_dpt.config:DPT_Pre2=[
     action=train
-    modelPath=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech
+    modelPath=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech
     NDLNetworkBuilder=[
-        networkDescription=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/dnn_1layer.txt
+        networkDescription=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/dnn_1layer.txt
     ]
 ]
 
 configparameters: cntk_dpt.config:GlobalInvStd=GlobalStats/var.363
 configparameters: cntk_dpt.config:GlobalMean=GlobalStats/mean.363
 configparameters: cntk_dpt.config:GlobalPrior=GlobalStats/prior.132
-configparameters: cntk_dpt.config:ndlMacros=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/macros.txt
+configparameters: cntk_dpt.config:ndlMacros=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/macros.txt
 configparameters: cntk_dpt.config:precision=float
 configparameters: cntk_dpt.config:reader=[
   readerType=HTKMLFReader
@@ -251,7 +254,7 @@ configparameters: cntk_dpt.config:reader=[
   ]
 ]
 
-configparameters: cntk_dpt.config:RunDir=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu
+configparameters: cntk_dpt.config:RunDir=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu
 configparameters: cntk_dpt.config:SGD=[
     epochSize=81920
     minibatchSize=256
@@ -264,11 +267,11 @@ configparameters: cntk_dpt.config:SGD=[
 
 configparameters: cntk_dpt.config:speechTrain=[
     action=train
-    modelPath=/tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech
+    modelPath=/tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech
     deviceId=0
     traceLevel=1
      NDLNetworkBuilder=[
-        networkDescription=/home/mluser/src/cplx_master/Tests/Speech/Data/ndl/dnn.txt
+        networkDescription=/home/mluser/src/cplx_master/Tests/Speech/DNN/DiscriminativePreTraining/dnn.txt
     ]
     SGD=[
         epochSize=81920
@@ -288,11 +291,11 @@ configparameters: cntk_dpt.config:traceLevel=1
 <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 command: DPT_Pre1 AddLayer2 DPT_Pre2 AddLayer3 speechTrain 
 precision = float
-CNTKModelPath: /tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech
+CNTKModelPath: /tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech
 CNTKCommandTrainInfo: DPT_Pre1 : 2
-CNTKModelPath: /tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech
+CNTKModelPath: /tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech
 CNTKCommandTrainInfo: DPT_Pre2 : 2
-CNTKModelPath: /tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech
+CNTKModelPath: /tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech
 CNTKCommandTrainInfo: speechTrain : 4
 CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 8
 CNTKCommandTrainBegin: DPT_Pre1
@@ -400,6 +403,24 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1]
 Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], OL.z[132, MBSize 1]) -> [1, 1]
 
+Validating for node cr. 6 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 1], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 1]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 1]) -> [512, MBSize 1]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 1], HL1.b[512, 1]) -> [512, MBSize 1]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 1]) -> [512, MBSize 1]
+Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 1]) -> [132, MBSize 1]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], OL.z[132, MBSize 1]) -> [1, 1]
+
 Validating for node cr, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 1]
@@ -441,7 +462,7 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 1], logPrior[132, 1]) -> [132, MBSize 1]
 
-Validating for node ScaledLogLikelihood. 2 nodes to process in pass 2.
+Validating for node ScaledLogLikelihood. 7 nodes to process in pass 2.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
 Validating --> HL1.W = LearnableParameter -> [512, 363]
@@ -502,6 +523,25 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 1], logPrior[132, 1]) -> [132, MBSize 1]
 
+Validating for node ScaledLogLikelihood. 7 nodes to process in pass 2.
+
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 1], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 1]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 1]) -> [512, MBSize 1]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 1], HL1.b[512, 1]) -> [512, MBSize 1]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 1]) -> [512, MBSize 1]
+Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 1]) -> [132, MBSize 1]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1]
+Validating --> GlobalPrior = LearnableParameter -> [132, 1]
+Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 1], logPrior[132, 1]) -> [132, MBSize 1]
+
 Validating for node ScaledLogLikelihood, final verification.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -543,7 +583,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 1], OL.z[132, MBSize 1]) -> [1, 1]
 
-Validating for node Err. 1 nodes to process in pass 2.
+Validating for node Err. 6 nodes to process in pass 2.
 
 Validating --> labels = InputValue -> [132, MBSize 1]
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -601,6 +641,24 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 1], OL.z[132, MBSize 1]) -> [1, 1]
 
+Validating for node Err. 6 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 1], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 1]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 1]) -> [512, MBSize 1]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 1], HL1.b[512, 1]) -> [512, MBSize 1]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 1]) -> [512, MBSize 1]
+Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 1]) -> [132, MBSize 1]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 1], OL.z[132, MBSize 1]) -> [1, 1]
+
 Validating for node Err, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 1]
@@ -621,6 +679,7 @@ Validating --> Err = ErrorPrediction(labels[132, MBSize 1], OL.z[132, MBSize 1])
 
 7 out of 15 nodes do not share the minibatch layout with the input data.
 
+SetUniformRandomValue (GPU): creating curand object with seed 1
 GetTrainCriterionNodes  ...
 GetEvalCriterionNodes  ...
 No PreCompute nodes found, skipping PreCompute step
@@ -630,79 +689,79 @@ minibatchiterator: epoch 0: frames [0..81920] (first utterance at frame 0), data
 requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
 
 Starting minibatch loop.
-EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once. 
- Epoch[ 1 of 2]-Minibatch[   1-  10 of 320]: SamplesSeen = 2560; TrainLossPerSample =  3.74183846; EvalErr[0]PerSample = 0.80195313; TotalTime = 0.30483s; TotalTimePerSample = 0.11907ms; SamplesPerSecond = 8398
- Epoch[ 1 of 2]-Minibatch[  11-  20 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.91124763; EvalErr[0]PerSample = 0.70898438; TotalTime = 0.12917s; TotalTimePerSample = 0.05046ms; SamplesPerSecond = 19818
- Epoch[ 1 of 2]-Minibatch[  21-  30 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.58015976; EvalErr[0]PerSample = 0.66640625; TotalTime = 0.12870s; TotalTimePerSample = 0.05027ms; SamplesPerSecond = 19891
- Epoch[ 1 of 2]-Minibatch[  31-  40 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.27427139; EvalErr[0]PerSample = 0.58750000; TotalTime = 0.12889s; TotalTimePerSample = 0.05035ms; SamplesPerSecond = 19861
- Epoch[ 1 of 2]-Minibatch[  41-  50 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.05503616; EvalErr[0]PerSample = 0.56093750; TotalTime = 0.12856s; TotalTimePerSample = 0.05022ms; SamplesPerSecond = 19912
- Epoch[ 1 of 2]-Minibatch[  51-  60 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.91055145; EvalErr[0]PerSample = 0.52812500; TotalTime = 0.12907s; TotalTimePerSample = 0.05042ms; SamplesPerSecond = 19833
- Epoch[ 1 of 2]-Minibatch[  61-  70 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.81562653; EvalErr[0]PerSample = 0.51171875; TotalTime = 0.12874s; TotalTimePerSample = 0.05029ms; SamplesPerSecond = 19884
- Epoch[ 1 of 2]-Minibatch[  71-  80 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.68803253; EvalErr[0]PerSample = 0.48476562; TotalTime = 0.12379s; TotalTimePerSample = 0.04836ms; SamplesPerSecond = 20680
- Epoch[ 1 of 2]-Minibatch[  81-  90 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.57382050; EvalErr[0]PerSample = 0.45429687; TotalTime = 0.12941s; TotalTimePerSample = 0.05055ms; SamplesPerSecond = 19781
- Epoch[ 1 of 2]-Minibatch[  91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.62090302; EvalErr[0]PerSample = 0.47304687; TotalTime = 0.12857s; TotalTimePerSample = 0.05022ms; SamplesPerSecond = 19911
+EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once.
+ Epoch[ 1 of 2]-Minibatch[   1-  10 of 320]: SamplesSeen = 2560; TrainLossPerSample =  3.74183807; EvalErr[0]PerSample = 0.80195313; TotalTime = 0.15056s; TotalTimePerSample = 0.05881ms; SamplesPerSecond = 17002
+ Epoch[ 1 of 2]-Minibatch[  11-  20 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.91124802; EvalErr[0]PerSample = 0.70898438; TotalTime = 0.06815s; TotalTimePerSample = 0.02662ms; SamplesPerSecond = 37561
+ Epoch[ 1 of 2]-Minibatch[  21-  30 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.58016052; EvalErr[0]PerSample = 0.66640625; TotalTime = 0.06821s; TotalTimePerSample = 0.02665ms; SamplesPerSecond = 37530
+ Epoch[ 1 of 2]-Minibatch[  31-  40 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.27427139; EvalErr[0]PerSample = 0.58750000; TotalTime = 0.06811s; TotalTimePerSample = 0.02660ms; SamplesPerSecond = 37587
+ Epoch[ 1 of 2]-Minibatch[  41-  50 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.05503540; EvalErr[0]PerSample = 0.56093750; TotalTime = 0.06794s; TotalTimePerSample = 0.02654ms; SamplesPerSecond = 37680
+ Epoch[ 1 of 2]-Minibatch[  51-  60 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.91055145; EvalErr[0]PerSample = 0.52812500; TotalTime = 0.06802s; TotalTimePerSample = 0.02657ms; SamplesPerSecond = 37638
+ Epoch[ 1 of 2]-Minibatch[  61-  70 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.81562653; EvalErr[0]PerSample = 0.51171875; TotalTime = 0.06809s; TotalTimePerSample = 0.02660ms; SamplesPerSecond = 37595
+ Epoch[ 1 of 2]-Minibatch[  71-  80 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.68803253; EvalErr[0]PerSample = 0.48476562; TotalTime = 0.06801s; TotalTimePerSample = 0.02657ms; SamplesPerSecond = 37640
+ Epoch[ 1 of 2]-Minibatch[  81-  90 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.57382050; EvalErr[0]PerSample = 0.45429687; TotalTime = 0.06800s; TotalTimePerSample = 0.02656ms; SamplesPerSecond = 37646
+ Epoch[ 1 of 2]-Minibatch[  91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.62090149; EvalErr[0]PerSample = 0.47304687; TotalTime = 0.06795s; TotalTimePerSample = 0.02654ms; SamplesPerSecond = 37673
 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times.
- Epoch[ 1 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.59272614; EvalErr[0]PerSample = 0.47500000; TotalTime = 0.12941s; TotalTimePerSample = 0.05055ms; SamplesPerSecond = 19781
- Epoch[ 1 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.51520386; EvalErr[0]PerSample = 0.44531250; TotalTime = 0.12911s; TotalTimePerSample = 0.05043ms; SamplesPerSecond = 19828
- Epoch[ 1 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.49181976; EvalErr[0]PerSample = 0.45039062; TotalTime = 0.10931s; TotalTimePerSample = 0.04270ms; SamplesPerSecond = 23418
- Epoch[ 1 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.53703613; EvalErr[0]PerSample = 0.44804688; TotalTime = 0.09591s; TotalTimePerSample = 0.03746ms; SamplesPerSecond = 26691
- Epoch[ 1 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.43095093; EvalErr[0]PerSample = 0.41640625; TotalTime = 0.09606s; TotalTimePerSample = 0.03753ms; SamplesPerSecond = 26648
- Epoch[ 1 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.41503601; EvalErr[0]PerSample = 0.40078125; TotalTime = 0.09662s; TotalTimePerSample = 0.03774ms; SamplesPerSecond = 26494
- Epoch[ 1 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.38912659; EvalErr[0]PerSample = 0.41132812; TotalTime = 0.09588s; TotalTimePerSample = 0.03745ms; SamplesPerSecond = 26700
- Epoch[ 1 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.41208191; EvalErr[0]PerSample = 0.42226562; TotalTime = 0.09560s; TotalTimePerSample = 0.03734ms; SamplesPerSecond = 26779
- Epoch[ 1 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.39966125; EvalErr[0]PerSample = 0.40664062; TotalTime = 0.09558s; TotalTimePerSample = 0.03734ms; SamplesPerSecond = 26782
- Epoch[ 1 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.42728577; EvalErr[0]PerSample = 0.42617187; TotalTime = 0.09572s; TotalTimePerSample = 0.03739ms; SamplesPerSecond = 26745
- Epoch[ 1 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.41336365; EvalErr[0]PerSample = 0.42304687; TotalTime = 0.09572s; TotalTimePerSample = 0.03739ms; SamplesPerSecond = 26744
- Epoch[ 1 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.33197937; EvalErr[0]PerSample = 0.39960937; TotalTime = 0.09589s; TotalTimePerSample = 0.03746ms; SamplesPerSecond = 26698
- Epoch[ 1 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.28578796; EvalErr[0]PerSample = 0.38671875; TotalTime = 0.09591s; TotalTimePerSample = 0.03747ms; SamplesPerSecond = 26691
- Epoch[ 1 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.34131775; EvalErr[0]PerSample = 0.40937500; TotalTime = 0.09552s; TotalTimePerSample = 0.03731ms; SamplesPerSecond = 26800
- Epoch[ 1 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.32666016; EvalErr[0]PerSample = 0.39648438; TotalTime = 0.09573s; TotalTimePerSample = 0.03740ms; SamplesPerSecond = 26741
- Epoch[ 1 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.21426086; EvalErr[0]PerSample = 0.37226562; TotalTime = 0.09610s; TotalTimePerSample = 0.03754ms; SamplesPerSecond = 26638
- Epoch[ 1 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23750610; EvalErr[0]PerSample = 0.37382813; TotalTime = 0.10318s; TotalTimePerSample = 0.04031ms; SamplesPerSecond = 24810
- Epoch[ 1 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.29967957; EvalErr[0]PerSample = 0.39062500; TotalTime = 0.12995s; TotalTimePerSample = 0.05076ms; SamplesPerSecond = 19699
- Epoch[ 1 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.21233215; EvalErr[0]PerSample = 0.37343750; TotalTime = 0.12914s; TotalTimePerSample = 0.05044ms; SamplesPerSecond = 19823
- Epoch[ 1 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.20534973; EvalErr[0]PerSample = 0.36718750; TotalTime = 0.12942s; TotalTimePerSample = 0.05056ms; SamplesPerSecond = 19779
- Epoch[ 1 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23558655; EvalErr[0]PerSample = 0.37187500; TotalTime = 0.12904s; TotalTimePerSample = 0.05041ms; SamplesPerSecond = 19838
- Epoch[ 1 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.25517273; EvalErr[0]PerSample = 0.37890625; TotalTime = 0.11876s; TotalTimePerSample = 0.04639ms; SamplesPerSecond = 21555
-Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 1.6294192; EvalErrPerSample = 0.46010742; Ave LearnRatePerSample = 0.003125000047; EpochTime=5.361751
+ Epoch[ 1 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.59272461; EvalErr[0]PerSample = 0.47500000; TotalTime = 0.06816s; TotalTimePerSample = 0.02662ms; SamplesPerSecond = 37559
+ Epoch[ 1 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.51520386; EvalErr[0]PerSample = 0.44531250; TotalTime = 0.06807s; TotalTimePerSample = 0.02659ms; SamplesPerSecond = 37605
+ Epoch[ 1 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.49181976; EvalErr[0]PerSample = 0.45039062; TotalTime = 0.06793s; TotalTimePerSample = 0.02654ms; SamplesPerSecond = 37685
+ Epoch[ 1 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.53703613; EvalErr[0]PerSample = 0.44804688; TotalTime = 0.06807s; TotalTimePerSample = 0.02659ms; SamplesPerSecond = 37605
+ Epoch[ 1 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.43095398; EvalErr[0]PerSample = 0.41640625; TotalTime = 0.06804s; TotalTimePerSample = 0.02658ms; SamplesPerSecond = 37623
+ Epoch[ 1 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.41503601; EvalErr[0]PerSample = 0.40078125; TotalTime = 0.06805s; TotalTimePerSample = 0.02658ms; SamplesPerSecond = 37617
+ Epoch[ 1 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.38913574; EvalErr[0]PerSample = 0.41132812; TotalTime = 0.06795s; TotalTimePerSample = 0.02654ms; SamplesPerSecond = 37674
+ Epoch[ 1 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.41207886; EvalErr[0]PerSample = 0.42226562; TotalTime = 0.06819s; TotalTimePerSample = 0.02664ms; SamplesPerSecond = 37541
+ Epoch[ 1 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.39968262; EvalErr[0]PerSample = 0.40664062; TotalTime = 0.06804s; TotalTimePerSample = 0.02658ms; SamplesPerSecond = 37625
+ Epoch[ 1 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.42729187; EvalErr[0]PerSample = 0.42617187; TotalTime = 0.06792s; TotalTimePerSample = 0.02653ms; SamplesPerSecond = 37690
+ Epoch[ 1 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.41336365; EvalErr[0]PerSample = 0.42343750; TotalTime = 0.06812s; TotalTimePerSample = 0.02661ms; SamplesPerSecond = 37578
+ Epoch[ 1 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.33186951; EvalErr[0]PerSample = 0.39960937; TotalTime = 0.06800s; TotalTimePerSample = 0.02656ms; SamplesPerSecond = 37647
+ Epoch[ 1 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.28581238; EvalErr[0]PerSample = 0.38710937; TotalTime = 0.06803s; TotalTimePerSample = 0.02658ms; SamplesPerSecond = 37628
+ Epoch[ 1 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.34127502; EvalErr[0]PerSample = 0.40976563; TotalTime = 0.06799s; TotalTimePerSample = 0.02656ms; SamplesPerSecond = 37652
+ Epoch[ 1 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.32666016; EvalErr[0]PerSample = 0.39726563; TotalTime = 0.06795s; TotalTimePerSample = 0.02654ms; SamplesPerSecond = 37673
+ Epoch[ 1 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.21437378; EvalErr[0]PerSample = 0.37265625; TotalTime = 0.06821s; TotalTimePerSample = 0.02664ms; SamplesPerSecond = 37532
+ Epoch[ 1 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23749695; EvalErr[0]PerSample = 0.37343750; TotalTime = 0.06804s; TotalTimePerSample = 0.02658ms; SamplesPerSecond = 37626
+ Epoch[ 1 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.29956665; EvalErr[0]PerSample = 0.39023438; TotalTime = 0.06814s; TotalTimePerSample = 0.02662ms; SamplesPerSecond = 37570
+ Epoch[ 1 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.21198120; EvalErr[0]PerSample = 0.37382813; TotalTime = 0.06804s; TotalTimePerSample = 0.02658ms; SamplesPerSecond = 37624
+ Epoch[ 1 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.20528259; EvalErr[0]PerSample = 0.36718750; TotalTime = 0.06800s; TotalTimePerSample = 0.02656ms; SamplesPerSecond = 37648
+ Epoch[ 1 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23613586; EvalErr[0]PerSample = 0.37343750; TotalTime = 0.06807s; TotalTimePerSample = 0.02659ms; SamplesPerSecond = 37608
+ Epoch[ 1 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.25615234; EvalErr[0]PerSample = 0.38164063; TotalTime = 0.06263s; TotalTimePerSample = 0.02446ms; SamplesPerSecond = 40877
+Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 1.6294507; EvalErrPerSample = 0.46030274; Ave LearnRatePerSample = 0.003125000047; EpochTime=3.113533
 Starting Epoch 2: learning rate per sample = 0.003125  effective momentum = 0.900000 
 minibatchiterator: epoch 1: frames [81920..163840] (first utterance at frame 81920), data subset 0 of 1, with 1 datapasses
 
 Starting minibatch loop.
- Epoch[ 2 of 2]-Minibatch[   1-  10 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23276577; EvalErr[0]PerSample = 0.38125000; TotalTime = 0.13037s; TotalTimePerSample = 0.05093ms; SamplesPerSecond = 19635
- Epoch[ 2 of 2]-Minibatch[  11-  20 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.20353279; EvalErr[0]PerSample = 0.37265625; TotalTime = 0.12890s; TotalTimePerSample = 0.05035ms; SamplesPerSecond = 19860
- Epoch[ 2 of 2]-Minibatch[  21-  30 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.28632336; EvalErr[0]PerSample = 0.37734375; TotalTime = 0.12453s; TotalTimePerSample = 0.04864ms; SamplesPerSecond = 20557
- Epoch[ 2 of 2]-Minibatch[  31-  40 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23058014; EvalErr[0]PerSample = 0.37812500; TotalTime = 0.09562s; TotalTimePerSample = 0.03735ms; SamplesPerSecond = 26772
- Epoch[ 2 of 2]-Minibatch[  41-  50 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18196945; EvalErr[0]PerSample = 0.35429688; TotalTime = 0.09557s; TotalTimePerSample = 0.03733ms; SamplesPerSecond = 26785
- Epoch[ 2 of 2]-Minibatch[  51-  60 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.28158035; EvalErr[0]PerSample = 0.38007812; TotalTime = 0.09562s; TotalTimePerSample = 0.03735ms; SamplesPerSecond = 26773
- Epoch[ 2 of 2]-Minibatch[  61-  70 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.22469864; EvalErr[0]PerSample = 0.37265625; TotalTime = 0.09554s; TotalTimePerSample = 0.03732ms; SamplesPerSecond = 26795
- Epoch[ 2 of 2]-Minibatch[  71-  80 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.17930145; EvalErr[0]PerSample = 0.36718750; TotalTime = 0.09564s; TotalTimePerSample = 0.03736ms; SamplesPerSecond = 26767
- Epoch[ 2 of 2]-Minibatch[  81-  90 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23973160; EvalErr[0]PerSample = 0.36328125; TotalTime = 0.09546s; TotalTimePerSample = 0.03729ms; SamplesPerSecond = 26817
- Epoch[ 2 of 2]-Minibatch[  91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18514709; EvalErr[0]PerSample = 0.37539062; TotalTime = 0.09566s; TotalTimePerSample = 0.03737ms; SamplesPerSecond = 26762
+ Epoch[ 2 of 2]-Minibatch[   1-  10 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23230944; EvalErr[0]PerSample = 0.38320312; TotalTime = 0.06923s; TotalTimePerSample = 0.02704ms; SamplesPerSecond = 36980
+ Epoch[ 2 of 2]-Minibatch[  11-  20 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.20511351; EvalErr[0]PerSample = 0.37421875; TotalTime = 0.06817s; TotalTimePerSample = 0.02663ms; SamplesPerSecond = 37551
+ Epoch[ 2 of 2]-Minibatch[  21-  30 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.28783760; EvalErr[0]PerSample = 0.37421875; TotalTime = 0.06806s; TotalTimePerSample = 0.02659ms; SamplesPerSecond = 37612
+ Epoch[ 2 of 2]-Minibatch[  31-  40 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.22809334; EvalErr[0]PerSample = 0.37421875; TotalTime = 0.06819s; TotalTimePerSample = 0.02664ms; SamplesPerSecond = 37539
+ Epoch[ 2 of 2]-Minibatch[  41-  50 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18090286; EvalErr[0]PerSample = 0.35468750; TotalTime = 0.06809s; TotalTimePerSample = 0.02660ms; SamplesPerSecond = 37598
+ Epoch[ 2 of 2]-Minibatch[  51-  60 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.28175354; EvalErr[0]PerSample = 0.37695312; TotalTime = 0.06818s; TotalTimePerSample = 0.02663ms; SamplesPerSecond = 37546
+ Epoch[ 2 of 2]-Minibatch[  61-  70 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.22251205; EvalErr[0]PerSample = 0.37382813; TotalTime = 0.06819s; TotalTimePerSample = 0.02664ms; SamplesPerSecond = 37542
+ Epoch[ 2 of 2]-Minibatch[  71-  80 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.17863007; EvalErr[0]PerSample = 0.36328125; TotalTime = 0.06815s; TotalTimePerSample = 0.02662ms; SamplesPerSecond = 37566
+ Epoch[ 2 of 2]-Minibatch[  81-  90 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23061218; EvalErr[0]PerSample = 0.35742188; TotalTime = 0.06819s; TotalTimePerSample = 0.02664ms; SamplesPerSecond = 37540
+ Epoch[ 2 of 2]-Minibatch[  91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18048782; EvalErr[0]PerSample = 0.37578125; TotalTime = 0.06799s; TotalTimePerSample = 0.02656ms; SamplesPerSecond = 37654
 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times.
- Epoch[ 2 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.20197525; EvalErr[0]PerSample = 0.36171875; TotalTime = 0.09590s; TotalTimePerSample = 0.03746ms; SamplesPerSecond = 26695
- Epoch[ 2 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18739471; EvalErr[0]PerSample = 0.35312500; TotalTime = 0.09610s; TotalTimePerSample = 0.03754ms; SamplesPerSecond = 26637
- Epoch[ 2 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16798859; EvalErr[0]PerSample = 0.35742188; TotalTime = 0.09603s; TotalTimePerSample = 0.03751ms; SamplesPerSecond = 26657
- Epoch[ 2 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.13375397; EvalErr[0]PerSample = 0.35273437; TotalTime = 0.09650s; TotalTimePerSample = 0.03769ms; SamplesPerSecond = 26529
- Epoch[ 2 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.09628754; EvalErr[0]PerSample = 0.31992188; TotalTime = 0.09675s; TotalTimePerSample = 0.03779ms; SamplesPerSecond = 26459
- Epoch[ 2 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.10226898; EvalErr[0]PerSample = 0.34218750; TotalTime = 0.09621s; TotalTimePerSample = 0.03758ms; SamplesPerSecond = 26608
- Epoch[ 2 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.20214386; EvalErr[0]PerSample = 0.36015625; TotalTime = 0.09606s; TotalTimePerSample = 0.03753ms; SamplesPerSecond = 26648
- Epoch[ 2 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.17007599; EvalErr[0]PerSample = 0.36015625; TotalTime = 0.09616s; TotalTimePerSample = 0.03756ms; SamplesPerSecond = 26621
- Epoch[ 2 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12343140; EvalErr[0]PerSample = 0.34843750; TotalTime = 0.09620s; TotalTimePerSample = 0.03758ms; SamplesPerSecond = 26611
- Epoch[ 2 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12009735; EvalErr[0]PerSample = 0.34570312; TotalTime = 0.09589s; TotalTimePerSample = 0.03746ms; SamplesPerSecond = 26697
- Epoch[ 2 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.10230255; EvalErr[0]PerSample = 0.33359375; TotalTime = 0.09559s; TotalTimePerSample = 0.03734ms; SamplesPerSecond = 26780
- Epoch[ 2 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12454529; EvalErr[0]PerSample = 0.34179688; TotalTime = 0.09594s; TotalTimePerSample = 0.03748ms; SamplesPerSecond = 26682
- Epoch[ 2 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.13382874; EvalErr[0]PerSample = 0.34921875; TotalTime = 0.09603s; TotalTimePerSample = 0.03751ms; SamplesPerSecond = 26657
- Epoch[ 2 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.27786255; EvalErr[0]PerSample = 0.39296875; TotalTime = 0.09608s; TotalTimePerSample = 0.03753ms; SamplesPerSecond = 26644
- Epoch[ 2 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16416626; EvalErr[0]PerSample = 0.34960938; TotalTime = 0.09607s; TotalTimePerSample = 0.03753ms; SamplesPerSecond = 26647
- Epoch[ 2 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12371216; EvalErr[0]PerSample = 0.35546875; TotalTime = 0.09599s; TotalTimePerSample = 0.03750ms; SamplesPerSecond = 26668
- Epoch[ 2 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.13847351; EvalErr[0]PerSample = 0.34453125; TotalTime = 0.09585s; TotalTimePerSample = 0.03744ms; SamplesPerSecond = 26707
- Epoch[ 2 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14408264; EvalErr[0]PerSample = 0.34414062; TotalTime = 0.09598s; TotalTimePerSample = 0.03749ms; SamplesPerSecond = 26671
- Epoch[ 2 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.06380920; EvalErr[0]PerSample = 0.33359375; TotalTime = 0.09587s; TotalTimePerSample = 0.03745ms; SamplesPerSecond = 26702
- Epoch[ 2 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.09358521; EvalErr[0]PerSample = 0.33476563; TotalTime = 0.09592s; TotalTimePerSample = 0.03747ms; SamplesPerSecond = 26690
- Epoch[ 2 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.08025513; EvalErr[0]PerSample = 0.33046875; TotalTime = 0.09581s; TotalTimePerSample = 0.03743ms; SamplesPerSecond = 26718
- Epoch[ 2 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.05906372; EvalErr[0]PerSample = 0.32968750; TotalTime = 0.08985s; TotalTimePerSample = 0.03510ms; SamplesPerSecond = 28490
-Finished Epoch[ 2 of 2]: [Training Set] TrainLossPerSample = 1.164273; EvalErrPerSample = 0.35511476; Ave LearnRatePerSample = 0.003125000047; EpochTime=3.177082
+ Epoch[ 2 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.19648056; EvalErr[0]PerSample = 0.35976562; TotalTime = 0.06812s; TotalTimePerSample = 0.02661ms; SamplesPerSecond = 37582
+ Epoch[ 2 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18896942; EvalErr[0]PerSample = 0.35429688; TotalTime = 0.06823s; TotalTimePerSample = 0.02665ms; SamplesPerSecond = 37521
+ Epoch[ 2 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16628113; EvalErr[0]PerSample = 0.35937500; TotalTime = 0.06815s; TotalTimePerSample = 0.02662ms; SamplesPerSecond = 37563
+ Epoch[ 2 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12856445; EvalErr[0]PerSample = 0.35195312; TotalTime = 0.06806s; TotalTimePerSample = 0.02659ms; SamplesPerSecond = 37613
+ Epoch[ 2 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.10083466; EvalErr[0]PerSample = 0.32617188; TotalTime = 0.06827s; TotalTimePerSample = 0.02667ms; SamplesPerSecond = 37496
+ Epoch[ 2 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.09875336; EvalErr[0]PerSample = 0.33906250; TotalTime = 0.06807s; TotalTimePerSample = 0.02659ms; SamplesPerSecond = 37610
+ Epoch[ 2 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18634949; EvalErr[0]PerSample = 0.35820313; TotalTime = 0.06811s; TotalTimePerSample = 0.02661ms; SamplesPerSecond = 37585
+ Epoch[ 2 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.15709991; EvalErr[0]PerSample = 0.35195312; TotalTime = 0.06819s; TotalTimePerSample = 0.02664ms; SamplesPerSecond = 37540
+ Epoch[ 2 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.10971069; EvalErr[0]PerSample = 0.34960938; TotalTime = 0.06807s; TotalTimePerSample = 0.02659ms; SamplesPerSecond = 37605
+ Epoch[ 2 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.11317139; EvalErr[0]PerSample = 0.35000000; TotalTime = 0.06824s; TotalTimePerSample = 0.02665ms; SamplesPerSecond = 37516
+ Epoch[ 2 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.08727722; EvalErr[0]PerSample = 0.32578125; TotalTime = 0.06816s; TotalTimePerSample = 0.02662ms; SamplesPerSecond = 37558
+ Epoch[ 2 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12295990; EvalErr[0]PerSample = 0.34101562; TotalTime = 0.06804s; TotalTimePerSample = 0.02658ms; SamplesPerSecond = 37626
+ Epoch[ 2 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12966003; EvalErr[0]PerSample = 0.35078125; TotalTime = 0.06816s; TotalTimePerSample = 0.02663ms; SamplesPerSecond = 37557
+ Epoch[ 2 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.27489319; EvalErr[0]PerSample = 0.39257812; TotalTime = 0.06816s; TotalTimePerSample = 0.02662ms; SamplesPerSecond = 37559
+ Epoch[ 2 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.17423401; EvalErr[0]PerSample = 0.35156250; TotalTime = 0.06803s; TotalTimePerSample = 0.02657ms; SamplesPerSecond = 37632
+ Epoch[ 2 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.13240051; EvalErr[0]PerSample = 0.35625000; TotalTime = 0.06813s; TotalTimePerSample = 0.02661ms; SamplesPerSecond = 37574
+ Epoch[ 2 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.13792114; EvalErr[0]PerSample = 0.34335938; TotalTime = 0.06802s; TotalTimePerSample = 0.02657ms; SamplesPerSecond = 37635
+ Epoch[ 2 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.13433228; EvalErr[0]PerSample = 0.33710937; TotalTime = 0.06814s; TotalTimePerSample = 0.02662ms; SamplesPerSecond = 37568
+ Epoch[ 2 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.05835876; EvalErr[0]PerSample = 0.33710937; TotalTime = 0.06819s; TotalTimePerSample = 0.02664ms; SamplesPerSecond = 37539
+ Epoch[ 2 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.09596558; EvalErr[0]PerSample = 0.33476563; TotalTime = 0.06819s; TotalTimePerSample = 0.02664ms; SamplesPerSecond = 37540
+ Epoch[ 2 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.08180847; EvalErr[0]PerSample = 0.33242187; TotalTime = 0.06802s; TotalTimePerSample = 0.02657ms; SamplesPerSecond = 37634
+ Epoch[ 2 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.06572876; EvalErr[0]PerSample = 0.33632812; TotalTime = 0.06260s; TotalTimePerSample = 0.02445ms; SamplesPerSecond = 40895
+Finished Epoch[ 2 of 2]: [Training Set] TrainLossPerSample = 1.1615628; EvalErrPerSample = 0.35460207; Ave LearnRatePerSample = 0.003125000047; EpochTime=2.185375
 CNTKCommandTrainEnd: DPT_Pre1
 
 
@@ -800,6 +859,24 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node cr. 6 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node cr, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -841,7 +918,7 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
-Validating for node ScaledLogLikelihood. 2 nodes to process in pass 2.
+Validating for node ScaledLogLikelihood. 7 nodes to process in pass 2.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
 Validating --> HL1.W = LearnableParameter -> [512, 363]
@@ -902,6 +979,25 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
+Validating for node ScaledLogLikelihood. 7 nodes to process in pass 2.
+
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> GlobalPrior = LearnableParameter -> [132, 1]
+Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
+
 Validating for node ScaledLogLikelihood, final verification.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -943,7 +1039,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
-Validating for node Err. 1 nodes to process in pass 2.
+Validating for node Err. 6 nodes to process in pass 2.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -1001,6 +1097,24 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node Err. 6 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node Err, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -1069,7 +1183,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
-Validating for node cr. 3 nodes to process in pass 2.
+Validating for node cr. 9 nodes to process in pass 2.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -1142,6 +1256,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node cr. 9 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node cr, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -1193,6 +1330,30 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
+Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2.
+
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> GlobalPrior = LearnableParameter -> [132, 1]
+Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
+
 Validating for node ScaledLogLikelihood, final verification.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -1245,6 +1406,30 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
+Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2.
+
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> GlobalPrior = LearnableParameter -> [132, 1]
+Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
+
 Validating for node ScaledLogLikelihood, final verification.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -1296,6 +1481,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node Err. 9 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node Err, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -1346,6 +1554,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node Err. 9 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node Err, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -1380,7 +1611,7 @@ htkmlfreader: reading MLF file /home/mluser/src/cplx_master/Tests/Speech/Data/gl
 ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
 label set 0: 129 classes
 minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
-Starting from checkpoint. Load Network From File /tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0.
+Starting from checkpoint. Load Network From File /tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0.
 
 
 Printing Gradient Computation Node Order ... 
@@ -1502,6 +1733,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node cr. 9 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node cr, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -1553,7 +1807,7 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
-Validating for node ScaledLogLikelihood. 2 nodes to process in pass 2.
+Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
 Validating --> HL2.W = LearnableParameter -> [512, 512]
@@ -1629,6 +1883,30 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
+Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2.
+
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> GlobalPrior = LearnableParameter -> [132, 1]
+Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
+
 Validating for node ScaledLogLikelihood, final verification.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -1680,7 +1958,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
-Validating for node Err. 1 nodes to process in pass 2.
+Validating for node Err. 9 nodes to process in pass 2.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -1753,6 +2031,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node Err. 9 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node Err, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -1787,78 +2088,78 @@ minibatchiterator: epoch 0: frames [0..81920] (first utterance at frame 0), data
 requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
 
 Starting minibatch loop.
- Epoch[ 1 of 2]-Minibatch[   1-  10 of 320]: SamplesSeen = 2560; TrainLossPerSample =  4.33646812; EvalErr[0]PerSample = 0.80507812; TotalTime = 0.17076s; TotalTimePerSample = 0.06670ms; SamplesPerSecond = 14991
- Epoch[ 1 of 2]-Minibatch[  11-  20 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.78729973; EvalErr[0]PerSample = 0.71328125; TotalTime = 0.16588s; TotalTimePerSample = 0.06480ms; SamplesPerSecond = 15432
- Epoch[ 1 of 2]-Minibatch[  21-  30 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.21825867; EvalErr[0]PerSample = 0.58007812; TotalTime = 0.13480s; TotalTimePerSample = 0.05266ms; SamplesPerSecond = 18991
- Epoch[ 1 of 2]-Minibatch[  31-  40 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.89405746; EvalErr[0]PerSample = 0.50468750; TotalTime = 0.12949s; TotalTimePerSample = 0.05058ms; SamplesPerSecond = 19769
- Epoch[ 1 of 2]-Minibatch[  41-  50 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.71779938; EvalErr[0]PerSample = 0.47578125; TotalTime = 0.16601s; TotalTimePerSample = 0.06485ms; SamplesPerSecond = 15420
- Epoch[ 1 of 2]-Minibatch[  51-  60 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.60265808; EvalErr[0]PerSample = 0.45000000; TotalTime = 0.16532s; TotalTimePerSample = 0.06458ms; SamplesPerSecond = 15484
- Epoch[ 1 of 2]-Minibatch[  61-  70 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.56439209; EvalErr[0]PerSample = 0.44843750; TotalTime = 0.16557s; TotalTimePerSample = 0.06468ms; SamplesPerSecond = 15461
- Epoch[ 1 of 2]-Minibatch[  71-  80 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.47621765; EvalErr[0]PerSample = 0.42578125; TotalTime = 0.13552s; TotalTimePerSample = 0.05294ms; SamplesPerSecond = 18890
- Epoch[ 1 of 2]-Minibatch[  81-  90 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.39409637; EvalErr[0]PerSample = 0.40625000; TotalTime = 0.12883s; TotalTimePerSample = 0.05032ms; SamplesPerSecond = 19871
- Epoch[ 1 of 2]-Minibatch[  91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.42145081; EvalErr[0]PerSample = 0.42343750; TotalTime = 0.12926s; TotalTimePerSample = 0.05049ms; SamplesPerSecond = 19805
+ Epoch[ 1 of 2]-Minibatch[   1-  10 of 320]: SamplesSeen = 2560; TrainLossPerSample =  4.30124588; EvalErr[0]PerSample = 0.80703125; TotalTime = 0.09340s; TotalTimePerSample = 0.03649ms; SamplesPerSecond = 27407
+ Epoch[ 1 of 2]-Minibatch[  11-  20 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.75448074; EvalErr[0]PerSample = 0.69960937; TotalTime = 0.09098s; TotalTimePerSample = 0.03554ms; SamplesPerSecond = 28139
+ Epoch[ 1 of 2]-Minibatch[  21-  30 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.20926208; EvalErr[0]PerSample = 0.58515625; TotalTime = 0.09073s; TotalTimePerSample = 0.03544ms; SamplesPerSecond = 28216
+ Epoch[ 1 of 2]-Minibatch[  31-  40 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.88578110; EvalErr[0]PerSample = 0.50117188; TotalTime = 0.09085s; TotalTimePerSample = 0.03549ms; SamplesPerSecond = 28176
+ Epoch[ 1 of 2]-Minibatch[  41-  50 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.71906204; EvalErr[0]PerSample = 0.47773437; TotalTime = 0.09080s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28193
+ Epoch[ 1 of 2]-Minibatch[  51-  60 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.60130463; EvalErr[0]PerSample = 0.44648437; TotalTime = 0.09085s; TotalTimePerSample = 0.03549ms; SamplesPerSecond = 28177
+ Epoch[ 1 of 2]-Minibatch[  61-  70 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.56077118; EvalErr[0]PerSample = 0.45000000; TotalTime = 0.09086s; TotalTimePerSample = 0.03549ms; SamplesPerSecond = 28175
+ Epoch[ 1 of 2]-Minibatch[  71-  80 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.47116547; EvalErr[0]PerSample = 0.42460938; TotalTime = 0.09079s; TotalTimePerSample = 0.03546ms; SamplesPerSecond = 28197
+ Epoch[ 1 of 2]-Minibatch[  81-  90 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.38874512; EvalErr[0]PerSample = 0.40781250; TotalTime = 0.09069s; TotalTimePerSample = 0.03543ms; SamplesPerSecond = 28227
+ Epoch[ 1 of 2]-Minibatch[  91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.41911163; EvalErr[0]PerSample = 0.42539063; TotalTime = 0.09077s; TotalTimePerSample = 0.03546ms; SamplesPerSecond = 28202
 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times.
- Epoch[ 1 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.39049683; EvalErr[0]PerSample = 0.42148438; TotalTime = 0.12864s; TotalTimePerSample = 0.05025ms; SamplesPerSecond = 19900
- Epoch[ 1 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.36727448; EvalErr[0]PerSample = 0.41054687; TotalTime = 0.12862s; TotalTimePerSample = 0.05024ms; SamplesPerSecond = 19903
- Epoch[ 1 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.33726044; EvalErr[0]PerSample = 0.40703125; TotalTime = 0.15213s; TotalTimePerSample = 0.05943ms; SamplesPerSecond = 16827
- Epoch[ 1 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.40177307; EvalErr[0]PerSample = 0.40781250; TotalTime = 0.12857s; TotalTimePerSample = 0.05022ms; SamplesPerSecond = 19910
- Epoch[ 1 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.33615417; EvalErr[0]PerSample = 0.39570312; TotalTime = 0.12867s; TotalTimePerSample = 0.05026ms; SamplesPerSecond = 19895
- Epoch[ 1 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.34133606; EvalErr[0]PerSample = 0.40273437; TotalTime = 0.12841s; TotalTimePerSample = 0.05016ms; SamplesPerSecond = 19936
- Epoch[ 1 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.26413574; EvalErr[0]PerSample = 0.37304688; TotalTime = 0.12802s; TotalTimePerSample = 0.05001ms; SamplesPerSecond = 19996
- Epoch[ 1 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.28038635; EvalErr[0]PerSample = 0.38593750; TotalTime = 0.12841s; TotalTimePerSample = 0.05016ms; SamplesPerSecond = 19936
- Epoch[ 1 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.29767151; EvalErr[0]PerSample = 0.39179687; TotalTime = 0.16430s; TotalTimePerSample = 0.06418ms; SamplesPerSecond = 15581
- Epoch[ 1 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.28023682; EvalErr[0]PerSample = 0.39687500; TotalTime = 0.16454s; TotalTimePerSample = 0.06427ms; SamplesPerSecond = 15558
- Epoch[ 1 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.26818542; EvalErr[0]PerSample = 0.38945313; TotalTime = 0.16489s; TotalTimePerSample = 0.06441ms; SamplesPerSecond = 15525
- Epoch[ 1 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.21394043; EvalErr[0]PerSample = 0.36250000; TotalTime = 0.16427s; TotalTimePerSample = 0.06417ms; SamplesPerSecond = 15583
- Epoch[ 1 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.20627136; EvalErr[0]PerSample = 0.36953125; TotalTime = 0.16384s; TotalTimePerSample = 0.06400ms; SamplesPerSecond = 15624
- Epoch[ 1 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.25008850; EvalErr[0]PerSample = 0.37929687; TotalTime = 0.16415s; TotalTimePerSample = 0.06412ms; SamplesPerSecond = 15595
- Epoch[ 1 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.22965393; EvalErr[0]PerSample = 0.37617187; TotalTime = 0.16463s; TotalTimePerSample = 0.06431ms; SamplesPerSecond = 15550
- Epoch[ 1 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.15062561; EvalErr[0]PerSample = 0.34960938; TotalTime = 0.16421s; TotalTimePerSample = 0.06414ms; SamplesPerSecond = 15590
- Epoch[ 1 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16630554; EvalErr[0]PerSample = 0.35390625; TotalTime = 0.13011s; TotalTimePerSample = 0.05082ms; SamplesPerSecond = 19675
- Epoch[ 1 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.22966309; EvalErr[0]PerSample = 0.37109375; TotalTime = 0.12816s; TotalTimePerSample = 0.05006ms; SamplesPerSecond = 19975
- Epoch[ 1 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16364136; EvalErr[0]PerSample = 0.36445312; TotalTime = 0.12827s; TotalTimePerSample = 0.05010ms; SamplesPerSecond = 19958
- Epoch[ 1 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.17280579; EvalErr[0]PerSample = 0.35351562; TotalTime = 0.12890s; TotalTimePerSample = 0.05035ms; SamplesPerSecond = 19860
- Epoch[ 1 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16119995; EvalErr[0]PerSample = 0.34687500; TotalTime = 0.12864s; TotalTimePerSample = 0.05025ms; SamplesPerSecond = 19901
- Epoch[ 1 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16999512; EvalErr[0]PerSample = 0.35000000; TotalTime = 0.12263s; TotalTimePerSample = 0.04790ms; SamplesPerSecond = 20875
-Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 1.5028688; EvalErrPerSample = 0.42475587; Ave LearnRatePerSample = 0.003125000047; EpochTime=5.763236
+ Epoch[ 1 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.38730774; EvalErr[0]PerSample = 0.42148438; TotalTime = 0.09085s; TotalTimePerSample = 0.03549ms; SamplesPerSecond = 28178
+ Epoch[ 1 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.36617889; EvalErr[0]PerSample = 0.41015625; TotalTime = 0.09080s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28194
+ Epoch[ 1 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.33381653; EvalErr[0]PerSample = 0.40781250; TotalTime = 0.09084s; TotalTimePerSample = 0.03549ms; SamplesPerSecond = 28180
+ Epoch[ 1 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.39802246; EvalErr[0]PerSample = 0.40546875; TotalTime = 0.09081s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28189
+ Epoch[ 1 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.33336182; EvalErr[0]PerSample = 0.40195313; TotalTime = 0.09078s; TotalTimePerSample = 0.03546ms; SamplesPerSecond = 28200
+ Epoch[ 1 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.33834229; EvalErr[0]PerSample = 0.40195313; TotalTime = 0.09078s; TotalTimePerSample = 0.03546ms; SamplesPerSecond = 28199
+ Epoch[ 1 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.26663208; EvalErr[0]PerSample = 0.37578125; TotalTime = 0.09074s; TotalTimePerSample = 0.03545ms; SamplesPerSecond = 28211
+ Epoch[ 1 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.28086243; EvalErr[0]PerSample = 0.39296875; TotalTime = 0.09072s; TotalTimePerSample = 0.03544ms; SamplesPerSecond = 28218
+ Epoch[ 1 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.29481506; EvalErr[0]PerSample = 0.39531250; TotalTime = 0.09076s; TotalTimePerSample = 0.03545ms; SamplesPerSecond = 28207
+ Epoch[ 1 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.27625122; EvalErr[0]PerSample = 0.39375000; TotalTime = 0.09079s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28196
+ Epoch[ 1 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.26905518; EvalErr[0]PerSample = 0.38984375; TotalTime = 0.09070s; TotalTimePerSample = 0.03543ms; SamplesPerSecond = 28223
+ Epoch[ 1 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.21494751; EvalErr[0]PerSample = 0.36250000; TotalTime = 0.09077s; TotalTimePerSample = 0.03546ms; SamplesPerSecond = 28204
+ Epoch[ 1 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.20699158; EvalErr[0]PerSample = 0.36914062; TotalTime = 0.09076s; TotalTimePerSample = 0.03545ms; SamplesPerSecond = 28207
+ Epoch[ 1 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.25002136; EvalErr[0]PerSample = 0.37851563; TotalTime = 0.09080s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28192
+ Epoch[ 1 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.22617493; EvalErr[0]PerSample = 0.37656250; TotalTime = 0.09081s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28189
+ Epoch[ 1 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14840393; EvalErr[0]PerSample = 0.35468750; TotalTime = 0.09064s; TotalTimePerSample = 0.03541ms; SamplesPerSecond = 28242
+ Epoch[ 1 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16649780; EvalErr[0]PerSample = 0.35468750; TotalTime = 0.09076s; TotalTimePerSample = 0.03545ms; SamplesPerSecond = 28206
+ Epoch[ 1 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.22885742; EvalErr[0]PerSample = 0.36992188; TotalTime = 0.09083s; TotalTimePerSample = 0.03548ms; SamplesPerSecond = 28185
+ Epoch[ 1 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16533203; EvalErr[0]PerSample = 0.36484375; TotalTime = 0.09081s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28190
+ Epoch[ 1 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.17502136; EvalErr[0]PerSample = 0.35664062; TotalTime = 0.09073s; TotalTimePerSample = 0.03544ms; SamplesPerSecond = 28215
+ Epoch[ 1 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16159058; EvalErr[0]PerSample = 0.35195312; TotalTime = 0.09071s; TotalTimePerSample = 0.03543ms; SamplesPerSecond = 28223
+ Epoch[ 1 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.17113953; EvalErr[0]PerSample = 0.35429688; TotalTime = 0.08532s; TotalTimePerSample = 0.03333ms; SamplesPerSecond = 30005
+Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 1.4990798; EvalErrPerSample = 0.42547607; Ave LearnRatePerSample = 0.003125000047; EpochTime=3.754035
 Starting Epoch 2: learning rate per sample = 0.003125  effective momentum = 0.900000 
 minibatchiterator: epoch 1: frames [81920..163840] (first utterance at frame 81920), data subset 0 of 1, with 1 datapasses
 
 Starting minibatch loop.
- Epoch[ 2 of 2]-Minibatch[   1-  10 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14169836; EvalErr[0]PerSample = 0.35156250; TotalTime = 0.12945s; TotalTimePerSample = 0.05057ms; SamplesPerSecond = 19775
- Epoch[ 2 of 2]-Minibatch[  11-  20 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16675386; EvalErr[0]PerSample = 0.35937500; TotalTime = 0.12838s; TotalTimePerSample = 0.05015ms; SamplesPerSecond = 19940
- Epoch[ 2 of 2]-Minibatch[  21-  30 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23896408; EvalErr[0]PerSample = 0.37421875; TotalTime = 0.12855s; TotalTimePerSample = 0.05022ms; SamplesPerSecond = 19914
- Epoch[ 2 of 2]-Minibatch[  31-  40 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.17921028; EvalErr[0]PerSample = 0.36289063; TotalTime = 0.12850s; TotalTimePerSample = 0.05019ms; SamplesPerSecond = 19922
- Epoch[ 2 of 2]-Minibatch[  41-  50 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.13760986; EvalErr[0]PerSample = 0.34843750; TotalTime = 0.12836s; TotalTimePerSample = 0.05014ms; SamplesPerSecond = 19943
- Epoch[ 2 of 2]-Minibatch[  51-  60 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.21572113; EvalErr[0]PerSample = 0.36601563; TotalTime = 0.12828s; TotalTimePerSample = 0.05011ms; SamplesPerSecond = 19956
- Epoch[ 2 of 2]-Minibatch[  61-  70 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14051437; EvalErr[0]PerSample = 0.34140625; TotalTime = 0.13201s; TotalTimePerSample = 0.05157ms; SamplesPerSecond = 19392
- Epoch[ 2 of 2]-Minibatch[  71-  80 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12286606; EvalErr[0]PerSample = 0.34492187; TotalTime = 0.16368s; TotalTimePerSample = 0.06394ms; SamplesPerSecond = 15640
- Epoch[ 2 of 2]-Minibatch[  81-  90 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14243240; EvalErr[0]PerSample = 0.33789062; TotalTime = 0.16444s; TotalTimePerSample = 0.06424ms; SamplesPerSecond = 15567
- Epoch[ 2 of 2]-Minibatch[  91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12677765; EvalErr[0]PerSample = 0.35390625; TotalTime = 0.16509s; TotalTimePerSample = 0.06449ms; SamplesPerSecond = 15506
+ Epoch[ 2 of 2]-Minibatch[   1-  10 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14215412; EvalErr[0]PerSample = 0.34882812; TotalTime = 0.09197s; TotalTimePerSample = 0.03593ms; SamplesPerSecond = 27835
+ Epoch[ 2 of 2]-Minibatch[  11-  20 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.17049236; EvalErr[0]PerSample = 0.36328125; TotalTime = 0.09082s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28189
+ Epoch[ 2 of 2]-Minibatch[  21-  30 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.24373856; EvalErr[0]PerSample = 0.37460938; TotalTime = 0.09080s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28193
+ Epoch[ 2 of 2]-Minibatch[  31-  40 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18655586; EvalErr[0]PerSample = 0.36445312; TotalTime = 0.09076s; TotalTimePerSample = 0.03546ms; SamplesPerSecond = 28204
+ Epoch[ 2 of 2]-Minibatch[  41-  50 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.13848000; EvalErr[0]PerSample = 0.35039063; TotalTime = 0.09077s; TotalTimePerSample = 0.03546ms; SamplesPerSecond = 28202
+ Epoch[ 2 of 2]-Minibatch[  51-  60 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.21884232; EvalErr[0]PerSample = 0.36757812; TotalTime = 0.09072s; TotalTimePerSample = 0.03544ms; SamplesPerSecond = 28219
+ Epoch[ 2 of 2]-Minibatch[  61-  70 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14372940; EvalErr[0]PerSample = 0.35000000; TotalTime = 0.09091s; TotalTimePerSample = 0.03551ms; SamplesPerSecond = 28160
+ Epoch[ 2 of 2]-Minibatch[  71-  80 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12769089; EvalErr[0]PerSample = 0.34960938; TotalTime = 0.09067s; TotalTimePerSample = 0.03542ms; SamplesPerSecond = 28235
+ Epoch[ 2 of 2]-Minibatch[  81-  90 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14114227; EvalErr[0]PerSample = 0.33554688; TotalTime = 0.09074s; TotalTimePerSample = 0.03545ms; SamplesPerSecond = 28212
+ Epoch[ 2 of 2]-Minibatch[  91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12445145; EvalErr[0]PerSample = 0.34843750; TotalTime = 0.09068s; TotalTimePerSample = 0.03542ms; SamplesPerSecond = 28231
 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times.
- Epoch[ 2 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14400177; EvalErr[0]PerSample = 0.33984375; TotalTime = 0.16439s; TotalTimePerSample = 0.06422ms; SamplesPerSecond = 15572
- Epoch[ 2 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12832870; EvalErr[0]PerSample = 0.34531250; TotalTime = 0.16414s; TotalTimePerSample = 0.06412ms; SamplesPerSecond = 15596
- Epoch[ 2 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.11099091; EvalErr[0]PerSample = 0.34414062; TotalTime = 0.16406s; TotalTimePerSample = 0.06409ms; SamplesPerSecond = 15603
- Epoch[ 2 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.06680908; EvalErr[0]PerSample = 0.32304688; TotalTime = 0.16356s; TotalTimePerSample = 0.06389ms; SamplesPerSecond = 15652
- Epoch[ 2 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.05362549; EvalErr[0]PerSample = 0.30859375; TotalTime = 0.16398s; TotalTimePerSample = 0.06405ms; SamplesPerSecond = 15611
- Epoch[ 2 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.06292725; EvalErr[0]PerSample = 0.32734375; TotalTime = 0.16541s; TotalTimePerSample = 0.06461ms; SamplesPerSecond = 15476
- Epoch[ 2 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14273834; EvalErr[0]PerSample = 0.34882812; TotalTime = 0.16545s; TotalTimePerSample = 0.06463ms; SamplesPerSecond = 15472
- Epoch[ 2 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14362183; EvalErr[0]PerSample = 0.35859375; TotalTime = 0.13098s; TotalTimePerSample = 0.05116ms; SamplesPerSecond = 19544
- Epoch[ 2 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.08687897; EvalErr[0]PerSample = 0.33671875; TotalTime = 0.12837s; TotalTimePerSample = 0.05014ms; SamplesPerSecond = 19942
- Epoch[ 2 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.07546844; EvalErr[0]PerSample = 0.33632812; TotalTime = 0.12850s; TotalTimePerSample = 0.05019ms; SamplesPerSecond = 19922
- Epoch[ 2 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.06579132; EvalErr[0]PerSample = 0.32695313; TotalTime = 0.12887s; TotalTimePerSample = 0.05034ms; SamplesPerSecond = 19864
- Epoch[ 2 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.09530640; EvalErr[0]PerSample = 0.33242187; TotalTime = 0.12837s; TotalTimePerSample = 0.05014ms; SamplesPerSecond = 19942
- Epoch[ 2 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.11944122; EvalErr[0]PerSample = 0.35117188; TotalTime = 0.12823s; TotalTimePerSample = 0.05009ms; SamplesPerSecond = 19963
- Epoch[ 2 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.13388062; EvalErr[0]PerSample = 0.35507813; TotalTime = 0.12828s; TotalTimePerSample = 0.05011ms; SamplesPerSecond = 19955
- Epoch[ 2 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.08914795; EvalErr[0]PerSample = 0.33437500; TotalTime = 0.12840s; TotalTimePerSample = 0.05016ms; SamplesPerSecond = 19937
- Epoch[ 2 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.06987000; EvalErr[0]PerSample = 0.33359375; TotalTime = 0.12851s; TotalTimePerSample = 0.05020ms; SamplesPerSecond = 19920
- Epoch[ 2 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.06095581; EvalErr[0]PerSample = 0.32109375; TotalTime = 0.12857s; TotalTimePerSample = 0.05022ms; SamplesPerSecond = 19911
- Epoch[ 2 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.09798889; EvalErr[0]PerSample = 0.33085938; TotalTime = 0.12868s; TotalTimePerSample = 0.05027ms; SamplesPerSecond = 19894
- Epoch[ 2 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.02103271; EvalErr[0]PerSample = 0.32890625; TotalTime = 0.12890s; TotalTimePerSample = 0.05035ms; SamplesPerSecond = 19860
- Epoch[ 2 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.06984253; EvalErr[0]PerSample = 0.33398438; TotalTime = 0.12823s; TotalTimePerSample = 0.05009ms; SamplesPerSecond = 19964
- Epoch[ 2 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.06397095; EvalErr[0]PerSample = 0.32929687; TotalTime = 0.12842s; TotalTimePerSample = 0.05016ms; SamplesPerSecond = 19935
- Epoch[ 2 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.05246582; EvalErr[0]PerSample = 0.33476563; TotalTime = 0.12215s; TotalTimePerSample = 0.04772ms; SamplesPerSecond = 20957
-Finished Epoch[ 2 of 2]: [Training Set] TrainLossPerSample = 1.1114886; EvalErrPerSample = 0.34130859; Ave LearnRatePerSample = 0.003125000047; EpochTime=4.486329
+ Epoch[ 2 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14137955; EvalErr[0]PerSample = 0.34101562; TotalTime = 0.09082s; TotalTimePerSample = 0.03548ms; SamplesPerSecond = 28186
+ Epoch[ 2 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12705154; EvalErr[0]PerSample = 0.33867188; TotalTime = 0.09065s; TotalTimePerSample = 0.03541ms; SamplesPerSecond = 28241
+ Epoch[ 2 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.10779419; EvalErr[0]PerSample = 0.34531250; TotalTime = 0.09078s; TotalTimePerSample = 0.03546ms; SamplesPerSecond = 28200
+ Epoch[ 2 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.07003021; EvalErr[0]PerSample = 0.32500000; TotalTime = 0.09075s; TotalTimePerSample = 0.03545ms; SamplesPerSecond = 28210
+ Epoch[ 2 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.05308990; EvalErr[0]PerSample = 0.31406250; TotalTime = 0.09077s; TotalTimePerSample = 0.03546ms; SamplesPerSecond = 28202
+ Epoch[ 2 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.06392975; EvalErr[0]PerSample = 0.33085938; TotalTime = 0.09070s; TotalTimePerSample = 0.03543ms; SamplesPerSecond = 28224
+ Epoch[ 2 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14430847; EvalErr[0]PerSample = 0.35507813; TotalTime = 0.09084s; TotalTimePerSample = 0.03549ms; SamplesPerSecond = 28180
+ Epoch[ 2 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14809570; EvalErr[0]PerSample = 0.35859375; TotalTime = 0.09087s; TotalTimePerSample = 0.03549ms; SamplesPerSecond = 28173
+ Epoch[ 2 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.08184509; EvalErr[0]PerSample = 0.33515625; TotalTime = 0.09083s; TotalTimePerSample = 0.03548ms; SamplesPerSecond = 28185
+ Epoch[ 2 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.07637024; EvalErr[0]PerSample = 0.33359375; TotalTime = 0.09081s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28189
+ Epoch[ 2 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.06249695; EvalErr[0]PerSample = 0.32500000; TotalTime = 0.09063s; TotalTimePerSample = 0.03540ms; SamplesPerSecond = 28247
+ Epoch[ 2 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.09361877; EvalErr[0]PerSample = 0.33320312; TotalTime = 0.09059s; TotalTimePerSample = 0.03539ms; SamplesPerSecond = 28257
+ Epoch[ 2 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12118530; EvalErr[0]PerSample = 0.34843750; TotalTime = 0.09067s; TotalTimePerSample = 0.03542ms; SamplesPerSecond = 28233
+ Epoch[ 2 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.13457642; EvalErr[0]PerSample = 0.35195312; TotalTime = 0.09075s; TotalTimePerSample = 0.03545ms; SamplesPerSecond = 28209
+ Epoch[ 2 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.09024963; EvalErr[0]PerSample = 0.33984375; TotalTime = 0.09067s; TotalTimePerSample = 0.03542ms; SamplesPerSecond = 28235
+ Epoch[ 2 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.07457275; EvalErr[0]PerSample = 0.33164063; TotalTime = 0.09070s; TotalTimePerSample = 0.03543ms; SamplesPerSecond = 28225
+ Epoch[ 2 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.05975952; EvalErr[0]PerSample = 0.32070312; TotalTime = 0.09076s; TotalTimePerSample = 0.03545ms; SamplesPerSecond = 28205
+ Epoch[ 2 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.09778137; EvalErr[0]PerSample = 0.33242187; TotalTime = 0.09082s; TotalTimePerSample = 0.03548ms; SamplesPerSecond = 28186
+ Epoch[ 2 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.01963196; EvalErr[0]PerSample = 0.32539062; TotalTime = 0.09081s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28190
+ Epoch[ 2 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.07533875; EvalErr[0]PerSample = 0.33515625; TotalTime = 0.09069s; TotalTimePerSample = 0.03542ms; SamplesPerSecond = 28228
+ Epoch[ 2 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.06417236; EvalErr[0]PerSample = 0.33007812; TotalTime = 0.09071s; TotalTimePerSample = 0.03543ms; SamplesPerSecond = 28221
+ Epoch[ 2 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.04990234; EvalErr[0]PerSample = 0.33359375; TotalTime = 0.08542s; TotalTimePerSample = 0.03337ms; SamplesPerSecond = 29970
+Finished Epoch[ 2 of 2]: [Training Set] TrainLossPerSample = 1.1123269; EvalErrPerSample = 0.34179688; Ave LearnRatePerSample = 0.003125000047; EpochTime=2.909345
 CNTKCommandTrainEnd: DPT_Pre2
 
 
@@ -1981,6 +2282,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node cr. 9 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node cr, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -2032,7 +2356,7 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
-Validating for node ScaledLogLikelihood. 2 nodes to process in pass 2.
+Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
 Validating --> HL2.W = LearnableParameter -> [512, 512]
@@ -2108,6 +2432,30 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
+Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2.
+
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> GlobalPrior = LearnableParameter -> [132, 1]
+Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
+
 Validating for node ScaledLogLikelihood, final verification.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -2159,7 +2507,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
-Validating for node Err. 1 nodes to process in pass 2.
+Validating for node Err. 9 nodes to process in pass 2.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -2232,6 +2580,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node Err. 9 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node Err, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -2315,7 +2686,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
-Validating for node cr. 3 nodes to process in pass 2.
+Validating for node cr. 12 nodes to process in pass 2.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -2403,6 +2774,34 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node cr. 12 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL3.W = LearnableParameter -> [512, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.b = LearnableParameter -> [512, 1]
+Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node cr, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -2464,6 +2863,35 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
+Validating for node ScaledLogLikelihood. 13 nodes to process in pass 2.
+
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL3.W = LearnableParameter -> [512, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.b = LearnableParameter -> [512, 1]
+Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> GlobalPrior = LearnableParameter -> [132, 1]
+Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
+
 Validating for node ScaledLogLikelihood, final verification.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -2526,6 +2954,35 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
+Validating for node ScaledLogLikelihood. 13 nodes to process in pass 2.
+
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL3.W = LearnableParameter -> [512, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.b = LearnableParameter -> [512, 1]
+Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> GlobalPrior = LearnableParameter -> [132, 1]
+Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
+
 Validating for node ScaledLogLikelihood, final verification.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -2587,6 +3044,34 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node Err. 12 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL3.W = LearnableParameter -> [512, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.b = LearnableParameter -> [512, 1]
+Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node Err, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -2647,6 +3132,34 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node Err. 12 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL3.W = LearnableParameter -> [512, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.b = LearnableParameter -> [512, 1]
+Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node Err, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -2686,7 +3199,7 @@ htkmlfreader: reading MLF file /home/mluser/src/cplx_master/Tests/Speech/Data/gl
 ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
 label set 0: 129 classes
 minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
-Starting from checkpoint. Load Network From File /tmp/cntk-test-20151012184916.181476/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0.
+Starting from checkpoint. Load Network From File /tmp/cntk-test-20151024125156.761111/Speech/DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0.
 
 
 Printing Gradient Computation Node Order ... 
@@ -2833,6 +3346,34 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node cr. 12 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL3.W = LearnableParameter -> [512, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.b = LearnableParameter -> [512, 1]
+Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node cr, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -2894,7 +3435,7 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
-Validating for node ScaledLogLikelihood. 2 nodes to process in pass 2.
+Validating for node ScaledLogLikelihood. 13 nodes to process in pass 2.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
 Validating --> HL3.W = LearnableParameter -> [512, 512]
@@ -2985,6 +3526,35 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
+Validating for node ScaledLogLikelihood. 13 nodes to process in pass 2.
+
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL3.W = LearnableParameter -> [512, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.b = LearnableParameter -> [512, 1]
+Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> GlobalPrior = LearnableParameter -> [132, 1]
+Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
+
 Validating for node ScaledLogLikelihood, final verification.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -3046,7 +3616,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
-Validating for node Err. 1 nodes to process in pass 2.
+Validating for node Err. 12 nodes to process in pass 2.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -3134,6 +3704,34 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node Err. 12 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL3.W = LearnableParameter -> [512, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.b = LearnableParameter -> [512, 1]
+Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node Err, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -3173,105 +3771,105 @@ minibatchiterator: epoch 0: frames [0..81920] (first utterance at frame 0), data
 requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
 
 Starting minibatch loop.
- Epoch[ 1 of 4]-Minibatch[   1-  10 of 320]: SamplesSeen = 2560; TrainLossPerSample =  3.96939201; EvalErr[0]PerSample = 0.81250000; TotalTime = 0.20433s; TotalTimePerSample = 0.07982ms; SamplesPerSecond = 12528
- Epoch[ 1 of 4]-Minibatch[  11-  20 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.64767342; EvalErr[0]PerSample = 0.63281250; TotalTime = 0.19765s; TotalTimePerSample = 0.07721ms; SamplesPerSecond = 12952
- Epoch[ 1 of 4]-Minibatch[  21-  30 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.02707901; EvalErr[0]PerSample = 0.53867188; TotalTime = 0.20620s; TotalTimePerSample = 0.08055ms; SamplesPerSecond = 12415
- Epoch[ 1 of 4]-Minibatch[  31-  40 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.74281921; EvalErr[0]PerSample = 0.47343750; TotalTime = 0.19865s; TotalTimePerSample = 0.07760ms; SamplesPerSecond = 12886
- Epoch[ 1 of 4]-Minibatch[  41-  50 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.58044128; EvalErr[0]PerSample = 0.45156250; TotalTime = 0.19802s; TotalTimePerSample = 0.07735ms; SamplesPerSecond = 12928
- Epoch[ 1 of 4]-Minibatch[  51-  60 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.47565231; EvalErr[0]PerSample = 0.41757813; TotalTime = 0.19878s; TotalTimePerSample = 0.07765ms; SamplesPerSecond = 12878
- Epoch[ 1 of 4]-Minibatch[  61-  70 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.43280945; EvalErr[0]PerSample = 0.41132812; TotalTime = 0.17110s; TotalTimePerSample = 0.06684ms; SamplesPerSecond = 14961
- Epoch[ 1 of 4]-Minibatch[  71-  80 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.35942993; EvalErr[0]PerSample = 0.39531250; TotalTime = 0.16138s; TotalTimePerSample = 0.06304ms; SamplesPerSecond = 15862
- Epoch[ 1 of 4]-Minibatch[  81-  90 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.28088837; EvalErr[0]PerSample = 0.37812500; TotalTime = 0.16122s; TotalTimePerSample = 0.06298ms; SamplesPerSecond = 15879
- Epoch[ 1 of 4]-Minibatch[  91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.29705811; EvalErr[0]PerSample = 0.39570312; TotalTime = 0.16121s; TotalTimePerSample = 0.06297ms; SamplesPerSecond = 15879
+ Epoch[ 1 of 4]-Minibatch[   1-  10 of 320]: SamplesSeen = 2560; TrainLossPerSample =  3.97086334; EvalErr[0]PerSample = 0.81445312; TotalTime = 0.11658s; TotalTimePerSample = 0.04554ms; SamplesPerSecond = 21959
+ Epoch[ 1 of 4]-Minibatch[  11-  20 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.63975830; EvalErr[0]PerSample = 0.63320312; TotalTime = 0.11338s; TotalTimePerSample = 0.04429ms; SamplesPerSecond = 22579
+ Epoch[ 1 of 4]-Minibatch[  21-  30 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.02565231; EvalErr[0]PerSample = 0.54257813; TotalTime = 0.11354s; TotalTimePerSample = 0.04435ms; SamplesPerSecond = 22546
+ Epoch[ 1 of 4]-Minibatch[  31-  40 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.74204865; EvalErr[0]PerSample = 0.47500000; TotalTime = 0.11328s; TotalTimePerSample = 0.04425ms; SamplesPerSecond = 22599
+ Epoch[ 1 of 4]-Minibatch[  41-  50 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.58343964; EvalErr[0]PerSample = 0.45156250; TotalTime = 0.11348s; TotalTimePerSample = 0.04433ms; SamplesPerSecond = 22559
+ Epoch[ 1 of 4]-Minibatch[  51-  60 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.47893143; EvalErr[0]PerSample = 0.42343750; TotalTime = 0.11351s; TotalTimePerSample = 0.04434ms; SamplesPerSecond = 22553
+ Epoch[ 1 of 4]-Minibatch[  61-  70 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.43405457; EvalErr[0]PerSample = 0.40898438; TotalTime = 0.11369s; TotalTimePerSample = 0.04441ms; SamplesPerSecond = 22517
+ Epoch[ 1 of 4]-Minibatch[  71-  80 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.35973663; EvalErr[0]PerSample = 0.39648438; TotalTime = 0.11353s; TotalTimePerSample = 0.04435ms; SamplesPerSecond = 22548
+ Epoch[ 1 of 4]-Minibatch[  81-  90 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.28108978; EvalErr[0]PerSample = 0.37968750; TotalTime = 0.11343s; TotalTimePerSample = 0.04431ms; SamplesPerSecond = 22568
+ Epoch[ 1 of 4]-Minibatch[  91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.29773560; EvalErr[0]PerSample = 0.39765625; TotalTime = 0.11329s; TotalTimePerSample = 0.04426ms; SamplesPerSecond = 22596
 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times.
- Epoch[ 1 of 4]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.28361969; EvalErr[0]PerSample = 0.39101562; TotalTime = 0.16147s; TotalTimePerSample = 0.06308ms; SamplesPerSecond = 15853
- Epoch[ 1 of 4]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.27552490; EvalErr[0]PerSample = 0.38515625; TotalTime = 0.16204s; TotalTimePerSample = 0.06330ms; SamplesPerSecond = 15798
- Epoch[ 1 of 4]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23978882; EvalErr[0]PerSample = 0.37265625; TotalTime = 0.16103s; TotalTimePerSample = 0.06290ms; SamplesPerSecond = 15897
- Epoch[ 1 of 4]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.31328888; EvalErr[0]PerSample = 0.38593750; TotalTime = 0.16089s; TotalTimePerSample = 0.06285ms; SamplesPerSecond = 15911
- Epoch[ 1 of 4]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.25646362; EvalErr[0]PerSample = 0.37109375; TotalTime = 0.18754s; TotalTimePerSample = 0.07326ms; SamplesPerSecond = 13650
- Epoch[ 1 of 4]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.27446442; EvalErr[0]PerSample = 0.38398437; TotalTime = 0.19911s; TotalTimePerSample = 0.07778ms; SamplesPerSecond = 12857
- Epoch[ 1 of 4]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.20181580; EvalErr[0]PerSample = 0.36289063; TotalTime = 0.19988s; TotalTimePerSample = 0.07808ms; SamplesPerSecond = 12807
- Epoch[ 1 of 4]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.20729980; EvalErr[0]PerSample = 0.36796875; TotalTime = 0.19928s; TotalTimePerSample = 0.07784ms; SamplesPerSecond = 12846
- Epoch[ 1 of 4]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.20639648; EvalErr[0]PerSample = 0.36914062; TotalTime = 0.19860s; TotalTimePerSample = 0.07758ms; SamplesPerSecond = 12890
- Epoch[ 1 of 4]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.20577698; EvalErr[0]PerSample = 0.37539062; TotalTime = 0.17294s; TotalTimePerSample = 0.06755ms; SamplesPerSecond = 14803
- Epoch[ 1 of 4]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.20345459; EvalErr[0]PerSample = 0.37265625; TotalTime = 0.16089s; TotalTimePerSample = 0.06285ms; SamplesPerSecond = 15911
- Epoch[ 1 of 4]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14157104; EvalErr[0]PerSample = 0.34609375; TotalTime = 0.16185s; TotalTimePerSample = 0.06322ms; SamplesPerSecond = 15817
- Epoch[ 1 of 4]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14772339; EvalErr[0]PerSample = 0.35351562; TotalTime = 0.16116s; TotalTimePerSample = 0.06295ms; SamplesPerSecond = 15884
- Epoch[ 1 of 4]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.19301453; EvalErr[0]PerSample = 0.35703125; TotalTime = 0.16144s; TotalTimePerSample = 0.06306ms; SamplesPerSecond = 15857
- Epoch[ 1 of 4]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16928101; EvalErr[0]PerSample = 0.36406250; TotalTime = 0.16115s; TotalTimePerSample = 0.06295ms; SamplesPerSecond = 15885
- Epoch[ 1 of 4]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.08552246; EvalErr[0]PerSample = 0.34062500; TotalTime = 0.16084s; TotalTimePerSample = 0.06283ms; SamplesPerSecond = 15916
- Epoch[ 1 of 4]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.11441040; EvalErr[0]PerSample = 0.33945313; TotalTime = 0.20073s; TotalTimePerSample = 0.07841ms; SamplesPerSecond = 12753
- Epoch[ 1 of 4]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.17764893; EvalErr[0]PerSample = 0.35546875; TotalTime = 0.20118s; TotalTimePerSample = 0.07859ms; SamplesPerSecond = 12724
- Epoch[ 1 of 4]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.11296692; EvalErr[0]PerSample = 0.34843750; TotalTime = 0.19897s; TotalTimePerSample = 0.07772ms; SamplesPerSecond = 12866
- Epoch[ 1 of 4]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.13165283; EvalErr[0]PerSample = 0.34453125; TotalTime = 0.19846s; TotalTimePerSample = 0.07752ms; SamplesPerSecond = 12899
- Epoch[ 1 of 4]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12458191; EvalErr[0]PerSample = 0.34570312; TotalTime = 0.19971s; TotalTimePerSample = 0.07801ms; SamplesPerSecond = 12818
- Epoch[ 1 of 4]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12154541; EvalErr[0]PerSample = 0.33906250; TotalTime = 0.19018s; TotalTimePerSample = 0.07429ms; SamplesPerSecond = 13461
-Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 1.406283; EvalErrPerSample = 0.40246582; Ave LearnRatePerSample = 0.003125000047; EpochTime=7.080416
+ Epoch[ 1 of 4]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.28441925; EvalErr[0]PerSample = 0.39062500; TotalTime = 0.11343s; TotalTimePerSample = 0.04431ms; SamplesPerSecond = 22567
+ Epoch[ 1 of 4]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.27777252; EvalErr[0]PerSample = 0.38164063; TotalTime = 0.11341s; TotalTimePerSample = 0.04430ms; SamplesPerSecond = 22573
+ Epoch[ 1 of 4]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23615112; EvalErr[0]PerSample = 0.37421875; TotalTime = 0.11341s; TotalTimePerSample = 0.04430ms; SamplesPerSecond = 22573
+ Epoch[ 1 of 4]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.31171112; EvalErr[0]PerSample = 0.38671875; TotalTime = 0.11351s; TotalTimePerSample = 0.04434ms; SamplesPerSecond = 22552
+ Epoch[ 1 of 4]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.25573883; EvalErr[0]PerSample = 0.37773438; TotalTime = 0.11337s; TotalTimePerSample = 0.04429ms; SamplesPerSecond = 22580
+ Epoch[ 1 of 4]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.27382965; EvalErr[0]PerSample = 0.38398437; TotalTime = 0.11349s; TotalTimePerSample = 0.04433ms; SamplesPerSecond = 22556
+ Epoch[ 1 of 4]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.20634155; EvalErr[0]PerSample = 0.36406250; TotalTime = 0.11336s; TotalTimePerSample = 0.04428ms; SamplesPerSecond = 22582
+ Epoch[ 1 of 4]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.20973816; EvalErr[0]PerSample = 0.36562500; TotalTime = 0.11355s; TotalTimePerSample = 0.04435ms; SamplesPerSecond = 22546
+ Epoch[ 1 of 4]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.20688782; EvalErr[0]PerSample = 0.36718750; TotalTime = 0.11352s; TotalTimePerSample = 0.04435ms; SamplesPerSecond = 22550
+ Epoch[ 1 of 4]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.20260315; EvalErr[0]PerSample = 0.37226562; TotalTime = 0.11337s; TotalTimePerSample = 0.04429ms; SamplesPerSecond = 22580
+ Epoch[ 1 of 4]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.20553894; EvalErr[0]PerSample = 0.37187500; TotalTime = 0.11352s; TotalTimePerSample = 0.04434ms; SamplesPerSecond = 22551
+ Epoch[ 1 of 4]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14160156; EvalErr[0]PerSample = 0.34726563; TotalTime = 0.11316s; TotalTimePerSample = 0.04420ms; SamplesPerSecond = 22623
+ Epoch[ 1 of 4]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.15316467; EvalErr[0]PerSample = 0.35273437; TotalTime = 0.11336s; TotalTimePerSample = 0.04428ms; SamplesPerSecond = 22583
+ Epoch[ 1 of 4]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.19352417; EvalErr[0]PerSample = 0.35468750; TotalTime = 0.11343s; TotalTimePerSample = 0.04431ms; SamplesPerSecond = 22568
+ Epoch[ 1 of 4]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.17192078; EvalErr[0]PerSample = 0.35937500; TotalTime = 0.11335s; TotalTimePerSample = 0.04428ms; SamplesPerSecond = 22584
+ Epoch[ 1 of 4]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.08281555; EvalErr[0]PerSample = 0.33867188; TotalTime = 0.11366s; TotalTimePerSample = 0.04440ms; SamplesPerSecond = 22522
+ Epoch[ 1 of 4]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.11028442; EvalErr[0]PerSample = 0.34453125; TotalTime = 0.11344s; TotalTimePerSample = 0.04431ms; SamplesPerSecond = 22567
+ Epoch[ 1 of 4]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.17454224; EvalErr[0]PerSample = 0.35312500; TotalTime = 0.11337s; TotalTimePerSample = 0.04428ms; SamplesPerSecond = 22581
+ Epoch[ 1 of 4]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.11068115; EvalErr[0]PerSample = 0.34531250; TotalTime = 0.11339s; TotalTimePerSample = 0.04429ms; SamplesPerSecond = 22577
+ Epoch[ 1 of 4]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12955627; EvalErr[0]PerSample = 0.34296875; TotalTime = 0.11348s; TotalTimePerSample = 0.04433ms; SamplesPerSecond = 22559
+ Epoch[ 1 of 4]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12482300; EvalErr[0]PerSample = 0.34570312; TotalTime = 0.11350s; TotalTimePerSample = 0.04434ms; SamplesPerSecond = 22554
+ Epoch[ 1 of 4]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12771912; EvalErr[0]PerSample = 0.34453125; TotalTime = 0.10801s; TotalTimePerSample = 0.04219ms; SamplesPerSecond = 23701
+Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 1.4063962; EvalErrPerSample = 0.40274659; Ave LearnRatePerSample = 0.003125000047; EpochTime=4.485052
 Starting Epoch 2: learning rate per sample = 0.003125  effective momentum = 0.810210 
 minibatchiterator: epoch 1: frames [81920..163840] (first utterance at frame 81920), data subset 0 of 1, with 1 datapasses
 
 Starting minibatch loop.
- Epoch[ 2 of 4]-Minibatch[   1-  10 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.49601746; EvalErr[0]PerSample = 0.41562500; TotalTime = 0.23368s; TotalTimePerSample = 0.04564ms; SamplesPerSecond = 21910
- Epoch[ 2 of 4]-Minibatch[  11-  20 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.33961754; EvalErr[0]PerSample = 0.39316406; TotalTime = 0.22738s; TotalTimePerSample = 0.04441ms; SamplesPerSecond = 22516
- Epoch[ 2 of 4]-Minibatch[  21-  30 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.19400368; EvalErr[0]PerSample = 0.36679688; TotalTime = 0.26154s; TotalTimePerSample = 0.05108ms; SamplesPerSecond = 19576
- Epoch[ 2 of 4]-Minibatch[  31-  40 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.11921158; EvalErr[0]PerSample = 0.34023437; TotalTime = 0.29191s; TotalTimePerSample = 0.05701ms; SamplesPerSecond = 17539
- Epoch[ 2 of 4]-Minibatch[  41-  50 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.12285690; EvalErr[0]PerSample = 0.34140625; TotalTime = 0.29226s; TotalTimePerSample = 0.05708ms; SamplesPerSecond = 17518
- Epoch[ 2 of 4]-Minibatch[  51-  60 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.13342743; EvalErr[0]PerSample = 0.34296875; TotalTime = 0.27248s; TotalTimePerSample = 0.05322ms; SamplesPerSecond = 18790
- Epoch[ 2 of 4]-Minibatch[  61-  70 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.08950500; EvalErr[0]PerSample = 0.33593750; TotalTime = 0.22818s; TotalTimePerSample = 0.04457ms; SamplesPerSecond = 22438
- Epoch[ 2 of 4]-Minibatch[  71-  80 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.06079788; EvalErr[0]PerSample = 0.32363281; TotalTime = 0.22761s; TotalTimePerSample = 0.04446ms; SamplesPerSecond = 22494
- Epoch[ 2 of 4]-Minibatch[  81-  90 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.18579025; EvalErr[0]PerSample = 0.36933594; TotalTime = 0.22761s; TotalTimePerSample = 0.04446ms; SamplesPerSecond = 22494
- Epoch[ 2 of 4]-Minibatch[  91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.08288193; EvalErr[0]PerSample = 0.34140625; TotalTime = 0.22663s; TotalTimePerSample = 0.04426ms; SamplesPerSecond = 22591
+ Epoch[ 2 of 4]-Minibatch[   1-  10 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.51739788; EvalErr[0]PerSample = 0.41425781; TotalTime = 0.17081s; TotalTimePerSample = 0.03336ms; SamplesPerSecond = 29974
+ Epoch[ 2 of 4]-Minibatch[  11-  20 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.25793457; EvalErr[0]PerSample = 0.37539062; TotalTime = 0.16557s; TotalTimePerSample = 0.03234ms; SamplesPerSecond = 30923
+ Epoch[ 2 of 4]-Minibatch[  21-  30 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.18638287; EvalErr[0]PerSample = 0.36718750; TotalTime = 0.16542s; TotalTimePerSample = 0.03231ms; SamplesPerSecond = 30950
+ Epoch[ 2 of 4]-Minibatch[  31-  40 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.12794571; EvalErr[0]PerSample = 0.34218750; TotalTime = 0.16542s; TotalTimePerSample = 0.03231ms; SamplesPerSecond = 30950
+ Epoch[ 2 of 4]-Minibatch[  41-  50 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.14070625; EvalErr[0]PerSample = 0.34570312; TotalTime = 0.16550s; TotalTimePerSample = 0.03232ms; SamplesPerSecond = 30936
+ Epoch[ 2 of 4]-Minibatch[  51-  60 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.14582825; EvalErr[0]PerSample = 0.34765625; TotalTime = 0.16544s; TotalTimePerSample = 0.03231ms; SamplesPerSecond = 30948
+ Epoch[ 2 of 4]-Minibatch[  61-  70 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.11193542; EvalErr[0]PerSample = 0.34414062; TotalTime = 0.16536s; TotalTimePerSample = 0.03230ms; SamplesPerSecond = 30963
+ Epoch[ 2 of 4]-Minibatch[  71-  80 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.08574600; EvalErr[0]PerSample = 0.33789062; TotalTime = 0.16542s; TotalTimePerSample = 0.03231ms; SamplesPerSecond = 30951
+ Epoch[ 2 of 4]-Minibatch[  81-  90 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.21058884; EvalErr[0]PerSample = 0.37363281; TotalTime = 0.16562s; TotalTimePerSample = 0.03235ms; SamplesPerSecond = 30914
+ Epoch[ 2 of 4]-Minibatch[  91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.09668579; EvalErr[0]PerSample = 0.34335938; TotalTime = 0.16571s; TotalTimePerSample = 0.03237ms; SamplesPerSecond = 30897
 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times.
- Epoch[ 2 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.05400925; EvalErr[0]PerSample = 0.32578125; TotalTime = 0.25990s; TotalTimePerSample = 0.05076ms; SamplesPerSecond = 19700
- Epoch[ 2 of 4]-Minibatch[ 111- 120 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.14049835; EvalErr[0]PerSample = 0.35664062; TotalTime = 0.29239s; TotalTimePerSample = 0.05711ms; SamplesPerSecond = 17510
- Epoch[ 2 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.11492462; EvalErr[0]PerSample = 0.34648438; TotalTime = 0.29289s; TotalTimePerSample = 0.05720ms; SamplesPerSecond = 17481
- Epoch[ 2 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.07589722; EvalErr[0]PerSample = 0.32265625; TotalTime = 0.29237s; TotalTimePerSample = 0.05710ms; SamplesPerSecond = 17512
- Epoch[ 2 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.04273682; EvalErr[0]PerSample = 0.32871094; TotalTime = 0.28067s; TotalTimePerSample = 0.05482ms; SamplesPerSecond = 18242
- Epoch[ 2 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.05914001; EvalErr[0]PerSample = 0.32421875; TotalTime = 0.25271s; TotalTimePerSample = 0.04936ms; SamplesPerSecond = 20260
-Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 1.1444572; EvalErrPerSample = 0.34843752; Ave LearnRatePerSample = 0.003125000047; EpochTime=4.181761
+ Epoch[ 2 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.05844955; EvalErr[0]PerSample = 0.32675781; TotalTime = 0.16548s; TotalTimePerSample = 0.03232ms; SamplesPerSecond = 30940
+ Epoch[ 2 of 4]-Minibatch[ 111- 120 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.10728455; EvalErr[0]PerSample = 0.34726563; TotalTime = 0.16561s; TotalTimePerSample = 0.03235ms; SamplesPerSecond = 30916
+ Epoch[ 2 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.08716888; EvalErr[0]PerSample = 0.33593750; TotalTime = 0.16526s; TotalTimePerSample = 0.03228ms; SamplesPerSecond = 30981
+ Epoch[ 2 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.06779022; EvalErr[0]PerSample = 0.31855469; TotalTime = 0.16545s; TotalTimePerSample = 0.03231ms; SamplesPerSecond = 30946
+ Epoch[ 2 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.04079590; EvalErr[0]PerSample = 0.32910156; TotalTime = 0.16529s; TotalTimePerSample = 0.03228ms; SamplesPerSecond = 30974
+ Epoch[ 2 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.06249695; EvalErr[0]PerSample = 0.32968750; TotalTime = 0.15482s; TotalTimePerSample = 0.03024ms; SamplesPerSecond = 33071
+Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 1.1440711; EvalErrPerSample = 0.34866944; Ave LearnRatePerSample = 0.003125000047; EpochTime=2.658179
 Starting Epoch 3: learning rate per sample = 0.003125  effective momentum = 0.810210 
 minibatchiterator: epoch 2: frames [163840..245760] (first utterance at frame 163840), data subset 0 of 1, with 1 datapasses
 
 Starting minibatch loop.
- Epoch[ 3 of 4]-Minibatch[   1-  10 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.11066093; EvalErr[0]PerSample = 0.33886719; TotalTime = 0.22848s; TotalTimePerSample = 0.04462ms; SamplesPerSecond = 22409
- Epoch[ 3 of 4]-Minibatch[  11-  20 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.10548515; EvalErr[0]PerSample = 0.34511719; TotalTime = 0.22788s; TotalTimePerSample = 0.04451ms; SamplesPerSecond = 22468
- Epoch[ 3 of 4]-Minibatch[  21-  30 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.10001144; EvalErr[0]PerSample = 0.34902344; TotalTime = 0.22845s; TotalTimePerSample = 0.04462ms; SamplesPerSecond = 22411
- Epoch[ 3 of 4]-Minibatch[  31-  40 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.12368736; EvalErr[0]PerSample = 0.33847656; TotalTime = 0.22749s; TotalTimePerSample = 0.04443ms; SamplesPerSecond = 22506
- Epoch[ 3 of 4]-Minibatch[  41-  50 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.12565804; EvalErr[0]PerSample = 0.34316406; TotalTime = 0.22824s; TotalTimePerSample = 0.04458ms; SamplesPerSecond = 22432
- Epoch[ 3 of 4]-Minibatch[  51-  60 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.08098526; EvalErr[0]PerSample = 0.33652344; TotalTime = 0.25245s; TotalTimePerSample = 0.04931ms; SamplesPerSecond = 20281
- Epoch[ 3 of 4]-Minibatch[  61-  70 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.09546432; EvalErr[0]PerSample = 0.33964844; TotalTime = 0.29113s; TotalTimePerSample = 0.05686ms; SamplesPerSecond = 17586
- Epoch[ 3 of 4]-Minibatch[  71-  80 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.07909393; EvalErr[0]PerSample = 0.33242187; TotalTime = 0.29145s; TotalTimePerSample = 0.05692ms; SamplesPerSecond = 17567
- Epoch[ 3 of 4]-Minibatch[  81-  90 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.02718582; EvalErr[0]PerSample = 0.31562500; TotalTime = 0.29116s; TotalTimePerSample = 0.05687ms; SamplesPerSecond = 17584
- Epoch[ 3 of 4]-Minibatch[  91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.04374771; EvalErr[0]PerSample = 0.31953125; TotalTime = 0.28709s; TotalTimePerSample = 0.05607ms; SamplesPerSecond = 17834
+ Epoch[ 3 of 4]-Minibatch[   1-  10 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.11238871; EvalErr[0]PerSample = 0.34804687; TotalTime = 0.16758s; TotalTimePerSample = 0.03273ms; SamplesPerSecond = 30552
+ Epoch[ 3 of 4]-Minibatch[  11-  20 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.09456167; EvalErr[0]PerSample = 0.34121094; TotalTime = 0.16526s; TotalTimePerSample = 0.03228ms; SamplesPerSecond = 30982
+ Epoch[ 3 of 4]-Minibatch[  21-  30 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.10800095; EvalErr[0]PerSample = 0.34667969; TotalTime = 0.16558s; TotalTimePerSample = 0.03234ms; SamplesPerSecond = 30921
+ Epoch[ 3 of 4]-Minibatch[  31-  40 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.16617966; EvalErr[0]PerSample = 0.35566406; TotalTime = 0.16543s; TotalTimePerSample = 0.03231ms; SamplesPerSecond = 30949
+ Epoch[ 3 of 4]-Minibatch[  41-  50 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.14173546; EvalErr[0]PerSample = 0.34550781; TotalTime = 0.16551s; TotalTimePerSample = 0.03233ms; SamplesPerSecond = 30935
+ Epoch[ 3 of 4]-Minibatch[  51-  60 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.07876015; EvalErr[0]PerSample = 0.33359375; TotalTime = 0.16532s; TotalTimePerSample = 0.03229ms; SamplesPerSecond = 30970
+ Epoch[ 3 of 4]-Minibatch[  61-  70 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.08043213; EvalErr[0]PerSample = 0.33437500; TotalTime = 0.16507s; TotalTimePerSample = 0.03224ms; SamplesPerSecond = 31017
+ Epoch[ 3 of 4]-Minibatch[  71-  80 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.07423630; EvalErr[0]PerSample = 0.33007812; TotalTime = 0.16543s; TotalTimePerSample = 0.03231ms; SamplesPerSecond = 30948
+ Epoch[ 3 of 4]-Minibatch[  81-  90 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.02659454; EvalErr[0]PerSample = 0.31113281; TotalTime = 0.16533s; TotalTimePerSample = 0.03229ms; SamplesPerSecond = 30967
+ Epoch[ 3 of 4]-Minibatch[  91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.04602737; EvalErr[0]PerSample = 0.31855469; TotalTime = 0.16517s; TotalTimePerSample = 0.03226ms; SamplesPerSecond = 30997
 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times.
- Epoch[ 3 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.05229645; EvalErr[0]PerSample = 0.33457031; TotalTime = 0.29182s; TotalTimePerSample = 0.05700ms; SamplesPerSecond = 17545
- Epoch[ 3 of 4]-Minibatch[ 111- 120 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.08028870; EvalErr[0]PerSample = 0.33769531; TotalTime = 0.23230s; TotalTimePerSample = 0.04537ms; SamplesPerSecond = 22040
- Epoch[ 3 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.05386963; EvalErr[0]PerSample = 0.31933594; TotalTime = 0.22718s; TotalTimePerSample = 0.04437ms; SamplesPerSecond = 22536
- Epoch[ 3 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.02473297; EvalErr[0]PerSample = 0.32167969; TotalTime = 0.22772s; TotalTimePerSample = 0.04448ms; SamplesPerSecond = 22483
- Epoch[ 3 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.04503784; EvalErr[0]PerSample = 0.33085938; TotalTime = 0.22719s; TotalTimePerSample = 0.04437ms; SamplesPerSecond = 22536
- Epoch[ 3 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.01943665; EvalErr[0]PerSample = 0.32050781; TotalTime = 0.21543s; TotalTimePerSample = 0.04208ms; SamplesPerSecond = 23766
-Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 1.0729777; EvalErrPerSample = 0.33269045; Ave LearnRatePerSample = 0.003125000047; EpochTime=3.995354
+ Epoch[ 3 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.05524902; EvalErr[0]PerSample = 0.33613281; TotalTime = 0.16554s; TotalTimePerSample = 0.03233ms; SamplesPerSecond = 30928
+ Epoch[ 3 of 4]-Minibatch[ 111- 120 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.07627411; EvalErr[0]PerSample = 0.33613281; TotalTime = 0.16533s; TotalTimePerSample = 0.03229ms; SamplesPerSecond = 30967
+ Epoch[ 3 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.05101776; EvalErr[0]PerSample = 0.31660156; TotalTime = 0.16521s; TotalTimePerSample = 0.03227ms; SamplesPerSecond = 30991
+ Epoch[ 3 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.03016815; EvalErr[0]PerSample = 0.32480469; TotalTime = 0.16532s; TotalTimePerSample = 0.03229ms; SamplesPerSecond = 30970
+ Epoch[ 3 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.04644623; EvalErr[0]PerSample = 0.32929687; TotalTime = 0.16540s; TotalTimePerSample = 0.03230ms; SamplesPerSecond = 30956
+ Epoch[ 3 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.02751465; EvalErr[0]PerSample = 0.32265625; TotalTime = 0.15429s; TotalTimePerSample = 0.03013ms; SamplesPerSecond = 33185
+Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 1.0759742; EvalErrPerSample = 0.33315429; Ave LearnRatePerSample = 0.003125000047; EpochTime=2.652503
 Starting Epoch 4: learning rate per sample = 0.003125  effective momentum = 0.810210 
 minibatchiterator: epoch 3: frames [245760..327680] (first utterance at frame 245760), data subset 0 of 1, with 1 datapasses
 
 Starting minibatch loop.
- Epoch[ 4 of 4]-Minibatch[   1-  10 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.02822218; EvalErr[0]PerSample = 0.31328125; TotalTime = 0.22742s; TotalTimePerSample = 0.04442ms; SamplesPerSecond = 22513
- Epoch[ 4 of 4]-Minibatch[  11-  20 of 160]: SamplesSeen = 4926; TrainLossPerSample =  1.04848684; EvalErr[0]PerSample = 0.32967925; TotalTime = 0.51921s; TotalTimePerSample = 0.10540ms; SamplesPerSecond = 9487
- Epoch[ 4 of 4]-Minibatch[  21-  30 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.01644306; EvalErr[0]PerSample = 0.32148437; TotalTime = 0.29076s; TotalTimePerSample = 0.05679ms; SamplesPerSecond = 17608
- Epoch[ 4 of 4]-Minibatch[  31-  40 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.99039593; EvalErr[0]PerSample = 0.31425781; TotalTime = 0.29139s; TotalTimePerSample = 0.05691ms; SamplesPerSecond = 17570
- Epoch[ 4 of 4]-Minibatch[  41-  50 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.99446030; EvalErr[0]PerSample = 0.31562500; TotalTime = 0.29187s; TotalTimePerSample = 0.05701ms; SamplesPerSecond = 17541
- Epoch[ 4 of 4]-Minibatch[  51-  60 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.00650482; EvalErr[0]PerSample = 0.32382813; TotalTime = 0.29209s; TotalTimePerSample = 0.05705ms; SamplesPerSecond = 17528
- Epoch[ 4 of 4]-Minibatch[  61-  70 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.02417755; EvalErr[0]PerSample = 0.32031250; TotalTime = 0.29194s; TotalTimePerSample = 0.05702ms; SamplesPerSecond = 17537
- Epoch[ 4 of 4]-Minibatch[  71-  80 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.01169128; EvalErr[0]PerSample = 0.31855469; TotalTime = 0.24643s; TotalTimePerSample = 0.04813ms; SamplesPerSecond = 20776
- Epoch[ 4 of 4]-Minibatch[  81-  90 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.99888992; EvalErr[0]PerSample = 0.30937500; TotalTime = 0.22709s; TotalTimePerSample = 0.04435ms; SamplesPerSecond = 22546
- Epoch[ 4 of 4]-Minibatch[  91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.00544128; EvalErr[0]PerSample = 0.31113281; TotalTime = 0.22708s; TotalTimePerSample = 0.04435ms; SamplesPerSecond = 22547
+ Epoch[ 4 of 4]-Minibatch[   1-  10 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.03003817; EvalErr[0]PerSample = 0.31289062; TotalTime = 0.16620s; TotalTimePerSample = 0.03246ms; SamplesPerSecond = 30805
+ Epoch[ 4 of 4]-Minibatch[  11-  20 of 160]: SamplesSeen = 4926; TrainLossPerSample =  1.04547925; EvalErr[0]PerSample = 0.32947625; TotalTime = 0.37782s; TotalTimePerSample = 0.07670ms; SamplesPerSecond = 13037
+ Epoch[ 4 of 4]-Minibatch[  21-  30 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.01249599; EvalErr[0]PerSample = 0.32246094; TotalTime = 0.16536s; TotalTimePerSample = 0.03230ms; SamplesPerSecond = 30962
+ Epoch[ 4 of 4]-Minibatch[  31-  40 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.99796467; EvalErr[0]PerSample = 0.31425781; TotalTime = 0.16531s; TotalTimePerSample = 0.03229ms; SamplesPerSecond = 30972
+ Epoch[ 4 of 4]-Minibatch[  41-  50 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.99781761; EvalErr[0]PerSample = 0.31464844; TotalTime = 0.16525s; TotalTimePerSample = 0.03228ms; SamplesPerSecond = 30983
+ Epoch[ 4 of 4]-Minibatch[  51-  60 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.00107079; EvalErr[0]PerSample = 0.31855469; TotalTime = 0.16515s; TotalTimePerSample = 0.03226ms; SamplesPerSecond = 31002
+ Epoch[ 4 of 4]-Minibatch[  61-  70 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.02518806; EvalErr[0]PerSample = 0.31972656; TotalTime = 0.16521s; TotalTimePerSample = 0.03227ms; SamplesPerSecond = 30990
+ Epoch[ 4 of 4]-Minibatch[  71-  80 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.00891876; EvalErr[0]PerSample = 0.31660156; TotalTime = 0.16531s; TotalTimePerSample = 0.03229ms; SamplesPerSecond = 30972
+ Epoch[ 4 of 4]-Minibatch[  81-  90 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.99774780; EvalErr[0]PerSample = 0.30585937; TotalTime = 0.16522s; TotalTimePerSample = 0.03227ms; SamplesPerSecond = 30989
+ Epoch[ 4 of 4]-Minibatch[  91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.00037842; EvalErr[0]PerSample = 0.30722656; TotalTime = 0.16522s; TotalTimePerSample = 0.03227ms; SamplesPerSecond = 30989
 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times.
- Epoch[ 4 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.01687851; EvalErr[0]PerSample = 0.31093750; TotalTime = 0.22702s; TotalTimePerSample = 0.04434ms; SamplesPerSecond = 22553
- Epoch[ 4 of 4]-Minibatch[ 111- 120 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.03951569; EvalErr[0]PerSample = 0.32851562; TotalTime = 0.26397s; TotalTimePerSample = 0.05156ms; SamplesPerSecond = 19396
- Epoch[ 4 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.98455429; EvalErr[0]PerSample = 0.30234375; TotalTime = 0.28984s; TotalTimePerSample = 0.05661ms; SamplesPerSecond = 17664
- Epoch[ 4 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.96297150; EvalErr[0]PerSample = 0.30136719; TotalTime = 0.29115s; TotalTimePerSample = 0.05687ms; SamplesPerSecond = 17585
- Epoch[ 4 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.98015137; EvalErr[0]PerSample = 0.31054688; TotalTime = 0.29163s; TotalTimePerSample = 0.05696ms; SamplesPerSecond = 17556
- Epoch[ 4 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.97653656; EvalErr[0]PerSample = 0.29863281; TotalTime = 0.27506s; TotalTimePerSample = 0.05372ms; SamplesPerSecond = 18614
-Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 1.0051814; EvalErrPerSample = 0.31445312; Ave LearnRatePerSample = 0.003125000047; EpochTime=4.579516
+ Epoch[ 4 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.02586746; EvalErr[0]PerSample = 0.31816406; TotalTime = 0.16529s; TotalTimePerSample = 0.03228ms; SamplesPerSecond = 30975
+ Epoch[ 4 of 4]-Minibatch[ 111- 120 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.06024628; EvalErr[0]PerSample = 0.33574219; TotalTime = 0.16542s; TotalTimePerSample = 0.03231ms; SamplesPerSecond = 30952
+ Epoch[ 4 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.98301010; EvalErr[0]PerSample = 0.30214844; TotalTime = 0.16545s; TotalTimePerSample = 0.03231ms; SamplesPerSecond = 30946
+ Epoch[ 4 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.96488800; EvalErr[0]PerSample = 0.30156250; TotalTime = 0.16533s; TotalTimePerSample = 0.03229ms; SamplesPerSecond = 30968
+ Epoch[ 4 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.99069977; EvalErr[0]PerSample = 0.31640625; TotalTime = 0.16536s; TotalTimePerSample = 0.03230ms; SamplesPerSecond = 30963
+ Epoch[ 4 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.97961731; EvalErr[0]PerSample = 0.29921875; TotalTime = 0.15761s; TotalTimePerSample = 0.03078ms; SamplesPerSecond = 32486
+Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 1.0073979; EvalErrPerSample = 0.31477052; Ave LearnRatePerSample = 0.003125000047; EpochTime=2.874394
 CNTKCommandTrainEnd: speechTrain
 COMPLETED
diff --git a/Tests/Speech/DNN/DiscriminativePreTraining/baseline.windows.gpu.txt b/Tests/Speech/DNN/DiscriminativePreTraining/baseline.windows.gpu.txt
index cbad133ab..9216dfc05 100644
--- a/Tests/Speech/DNN/DiscriminativePreTraining/baseline.windows.gpu.txt
+++ b/Tests/Speech/DNN/DiscriminativePreTraining/baseline.windows.gpu.txt
@@ -1,22 +1,22 @@
-=== Running /cygdrive/e/NetScale/CNTK/git_repos/cplx_master2/x64/debug/cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining\cntk_dpt.config RunDir=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data DeviceId=0
+=== Running /cygdrive/e/NetScale/CNTK/git_repos/cplx_master2/x64/debug/cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/cntk_dpt.config RunDir=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining DeviceId=0
 -------------------------------------------------------------------
 Build info: 
 
-		Built time: Oct 12 2015 17:58:56
-		Last modified date: Sat Oct 10 19:47:14 2015
+		Built time: Oct 24 2015 13:33:25
+		Last modified date: Thu Oct 22 16:00:27 2015
 		Built by amitaga on Amitaga-Win-DT3           
 		Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\
 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
 -------------------------------------------------------------------
-running on Amitaga-Win-DT3 at 2015/10/13 02:34:55
-command line options: 
-configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining\cntk_dpt.config RunDir=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data DeviceId=0 
+running on Amitaga-Win-DT3 at 2015/10/24 22:09:53
+command line: 
+E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/cntk_dpt.config RunDir=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining DeviceId=0 
 
 >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
 precision=float
 deviceId=$DeviceId$
 command=DPT_Pre1:AddLayer2:DPT_Pre2:AddLayer3:speechTrain
-ndlMacros=$DataDir$/ndl/macros.txt
+ndlMacros=$ConfigDir$/macros.txt
 GlobalMean=GlobalStats/mean.363
 GlobalInvStd=GlobalStats/var.363
 GlobalPrior=GlobalStats/prior.132
@@ -34,7 +34,7 @@ DPT_Pre1=[
     action=train
     modelPath=$RunDir$/models/Pre1/cntkSpeech
     NDLNetworkBuilder=[
-        networkDescription=$DataDir$/ndl/dnn_1layer.txt
+        networkDescription=$ConfigDir$/dnn_1layer.txt
     ]
 ]
 AddLayer2=[    
@@ -43,13 +43,13 @@ AddLayer2=[
     NewLayer=2
     CurrModel=$RunDir$/models/Pre1/cntkSpeech
     NewModel=$RunDir$/models/Pre2/cntkSpeech.0
-    editPath=$DataDir$/ndl/add_layer.mel
+    editPath=$ConfigDir$/add_layer.mel
 ]
 DPT_Pre2=[
     action=train
     modelPath=$RunDir$/models/Pre2/cntkSpeech
     NDLNetworkBuilder=[
-        networkDescription=$DataDir$/ndl/dnn_1layer.txt
+        networkDescription=$ConfigDir$/dnn_1layer.txt
     ]
 ]
 AddLayer3=[    
@@ -58,7 +58,7 @@ AddLayer3=[
     NewLayer=3
     CurrModel=$RunDir$/models/Pre2/cntkSpeech
     NewModel=$RunDir$/models/cntkSpeech.0
-    editPath=$DataDir$/ndl/add_layer.mel
+    editPath=$ConfigDir$/add_layer.mel
 ]
 speechTrain=[
     action=train
@@ -66,7 +66,7 @@ speechTrain=[
     deviceId=$DeviceId$
     traceLevel=1
      NDLNetworkBuilder=[
-        networkDescription=$DataDir$/ndl/dnn.txt
+        networkDescription=$ConfigDir$/dnn.txt
     ]
     SGD=[
         epochSize=81920
@@ -99,8 +99,9 @@ reader=[
       labelType=Category
   ]
 ]
-RunDir=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu
+RunDir=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu
 DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
+ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining
 DeviceId=0
 
 <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
@@ -109,7 +110,7 @@ DeviceId=0
 precision=float
 deviceId=0
 command=DPT_Pre1:AddLayer2:DPT_Pre2:AddLayer3:speechTrain
-ndlMacros=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/macros.txt
+ndlMacros=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/macros.txt
 GlobalMean=GlobalStats/mean.363
 GlobalInvStd=GlobalStats/var.363
 GlobalPrior=GlobalStats/prior.132
@@ -125,41 +126,41 @@ SGD=[
 ]
 DPT_Pre1=[
     action=train
-    modelPath=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech
+    modelPath=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech
     NDLNetworkBuilder=[
-        networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/dnn_1layer.txt
+        networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/dnn_1layer.txt
     ]
 ]
 AddLayer2=[    
     action=edit
     CurrLayer=1
     NewLayer=2
-    CurrModel=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech
-    NewModel=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0
-    editPath=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/add_layer.mel
+    CurrModel=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech
+    NewModel=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0
+    editPath=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/add_layer.mel
 ]
 DPT_Pre2=[
     action=train
-    modelPath=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech
+    modelPath=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech
     NDLNetworkBuilder=[
-        networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/dnn_1layer.txt
+        networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/dnn_1layer.txt
     ]
 ]
 AddLayer3=[    
     action=edit
     CurrLayer=2
     NewLayer=3
-    CurrModel=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech
-    NewModel=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0
-    editPath=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/add_layer.mel
+    CurrModel=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech
+    NewModel=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0
+    editPath=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/add_layer.mel
 ]
 speechTrain=[
     action=train
-    modelPath=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech
+    modelPath=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech
     deviceId=0
     traceLevel=1
      NDLNetworkBuilder=[
-        networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/dnn.txt
+        networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/dnn.txt
     ]
     SGD=[
         epochSize=81920
@@ -192,8 +193,9 @@ reader=[
       labelType=Category
   ]
 ]
-RunDir=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu
+RunDir=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu
 DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
+ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining
 DeviceId=0
 
 <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
@@ -203,43 +205,44 @@ configparameters: cntk_dpt.config:AddLayer2=[
     action=edit
     CurrLayer=1
     NewLayer=2
-    CurrModel=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech
-    NewModel=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0
-    editPath=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/add_layer.mel
+    CurrModel=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech
+    NewModel=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0
+    editPath=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/add_layer.mel
 ]
 
 configparameters: cntk_dpt.config:AddLayer3=[    
     action=edit
     CurrLayer=2
     NewLayer=3
-    CurrModel=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech
-    NewModel=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0
-    editPath=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/add_layer.mel
+    CurrModel=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech
+    NewModel=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0
+    editPath=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/add_layer.mel
 ]
 
 configparameters: cntk_dpt.config:command=DPT_Pre1:AddLayer2:DPT_Pre2:AddLayer3:speechTrain
+configparameters: cntk_dpt.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining
 configparameters: cntk_dpt.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
 configparameters: cntk_dpt.config:deviceId=0
 configparameters: cntk_dpt.config:DPT_Pre1=[
     action=train
-    modelPath=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech
+    modelPath=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech
     NDLNetworkBuilder=[
-        networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/dnn_1layer.txt
+        networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/dnn_1layer.txt
     ]
 ]
 
 configparameters: cntk_dpt.config:DPT_Pre2=[
     action=train
-    modelPath=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech
+    modelPath=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech
     NDLNetworkBuilder=[
-        networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/dnn_1layer.txt
+        networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/dnn_1layer.txt
     ]
 ]
 
 configparameters: cntk_dpt.config:GlobalInvStd=GlobalStats/var.363
 configparameters: cntk_dpt.config:GlobalMean=GlobalStats/mean.363
 configparameters: cntk_dpt.config:GlobalPrior=GlobalStats/prior.132
-configparameters: cntk_dpt.config:ndlMacros=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/macros.txt
+configparameters: cntk_dpt.config:ndlMacros=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/macros.txt
 configparameters: cntk_dpt.config:precision=float
 configparameters: cntk_dpt.config:reader=[
   readerType=HTKMLFReader
@@ -260,7 +263,7 @@ configparameters: cntk_dpt.config:reader=[
   ]
 ]
 
-configparameters: cntk_dpt.config:RunDir=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu
+configparameters: cntk_dpt.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu
 configparameters: cntk_dpt.config:SGD=[
     epochSize=81920
     minibatchSize=256
@@ -273,11 +276,11 @@ configparameters: cntk_dpt.config:SGD=[
 
 configparameters: cntk_dpt.config:speechTrain=[
     action=train
-    modelPath=C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech
+    modelPath=C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech
     deviceId=0
     traceLevel=1
      NDLNetworkBuilder=[
-        networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/ndl/dnn.txt
+        networkDescription=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN\DiscriminativePreTraining/dnn.txt
     ]
     SGD=[
         epochSize=81920
@@ -297,11 +300,11 @@ configparameters: cntk_dpt.config:traceLevel=1
 <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 command: DPT_Pre1 AddLayer2 DPT_Pre2 AddLayer3 speechTrain 
 precision = float
-CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech
+CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre1/cntkSpeech
 CNTKCommandTrainInfo: DPT_Pre1 : 2
-CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech
+CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech
 CNTKCommandTrainInfo: DPT_Pre2 : 2
-CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech
+CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech
 CNTKCommandTrainInfo: speechTrain : 4
 CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 8
 CNTKCommandTrainBegin: DPT_Pre1
@@ -409,6 +412,24 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1]
 Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], OL.z[132, MBSize 1]) -> [1, 1]
 
+Validating for node cr. 6 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 1], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 1]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 1]) -> [512, MBSize 1]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 1], HL1.b[512, 1]) -> [512, MBSize 1]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 1]) -> [512, MBSize 1]
+Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 1]) -> [132, MBSize 1]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], OL.z[132, MBSize 1]) -> [1, 1]
+
 Validating for node cr, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 1]
@@ -450,7 +471,7 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 1], logPrior[132, 1]) -> [132, MBSize 1]
 
-Validating for node ScaledLogLikelihood. 2 nodes to process in pass 2.
+Validating for node ScaledLogLikelihood. 7 nodes to process in pass 2.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
 Validating --> HL1.W = LearnableParameter -> [512, 363]
@@ -511,6 +532,25 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 1], logPrior[132, 1]) -> [132, MBSize 1]
 
+Validating for node ScaledLogLikelihood. 7 nodes to process in pass 2.
+
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 1], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 1]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 1]) -> [512, MBSize 1]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 1], HL1.b[512, 1]) -> [512, MBSize 1]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 1]) -> [512, MBSize 1]
+Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 1]) -> [132, MBSize 1]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1]
+Validating --> GlobalPrior = LearnableParameter -> [132, 1]
+Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 1], logPrior[132, 1]) -> [132, MBSize 1]
+
 Validating for node ScaledLogLikelihood, final verification.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -552,7 +592,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 1], OL.z[132, MBSize 1]) -> [1, 1]
 
-Validating for node Err. 1 nodes to process in pass 2.
+Validating for node Err. 6 nodes to process in pass 2.
 
 Validating --> labels = InputValue -> [132, MBSize 1]
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -610,6 +650,24 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 1], OL.z[132, MBSize 1]) -> [1, 1]
 
+Validating for node Err. 6 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 1], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 1]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 1]) -> [512, MBSize 1]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 1], HL1.b[512, 1]) -> [512, MBSize 1]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 1]) -> [512, MBSize 1]
+Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 1]) -> [132, MBSize 1]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 1], OL.b[132, 1]) -> [132, MBSize 1]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 1], OL.z[132, MBSize 1]) -> [1, 1]
+
 Validating for node Err, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 1]
@@ -630,6 +688,7 @@ Validating --> Err = ErrorPrediction(labels[132, MBSize 1], OL.z[132, MBSize 1])
 
 7 out of 15 nodes do not share the minibatch layout with the input data.
 
+SetUniformRandomValue (GPU): creating curand object with seed 1
 GetTrainCriterionNodes  ...
 GetEvalCriterionNodes  ...
 No PreCompute nodes found, skipping PreCompute step
@@ -639,78 +698,78 @@ minibatchiterator: epoch 0: frames [0..81920] (first utterance at frame 0), data
 requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
 
 Starting minibatch loop.
- Epoch[ 1 of 2]-Minibatch[   1-  10 of 320]: SamplesSeen = 2560; TrainLossPerSample =  3.89978218; EvalErr[0]PerSample = 0.84375000; TotalTime = 0.70651s; TotalTimePerSample = 0.27598ms; SamplesPerSecond = 3623
- Epoch[ 1 of 2]-Minibatch[  11-  20 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.96755714; EvalErr[0]PerSample = 0.72031250; TotalTime = 0.28515s; TotalTimePerSample = 0.11139ms; SamplesPerSecond = 8977
- Epoch[ 1 of 2]-Minibatch[  21-  30 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.55723495; EvalErr[0]PerSample = 0.65859375; TotalTime = 0.26848s; TotalTimePerSample = 0.10488ms; SamplesPerSecond = 9535
- Epoch[ 1 of 2]-Minibatch[  31-  40 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.29642715; EvalErr[0]PerSample = 0.61992187; TotalTime = 0.25356s; TotalTimePerSample = 0.09905ms; SamplesPerSecond = 10096
- Epoch[ 1 of 2]-Minibatch[  41-  50 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.02396469; EvalErr[0]PerSample = 0.55117187; TotalTime = 0.24481s; TotalTimePerSample = 0.09563ms; SamplesPerSecond = 10457
- Epoch[ 1 of 2]-Minibatch[  51-  60 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.87309418; EvalErr[0]PerSample = 0.51484375; TotalTime = 0.23464s; TotalTimePerSample = 0.09166ms; SamplesPerSecond = 10910
- Epoch[ 1 of 2]-Minibatch[  61-  70 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.78157196; EvalErr[0]PerSample = 0.50507813; TotalTime = 0.22702s; TotalTimePerSample = 0.08868ms; SamplesPerSecond = 11276
- Epoch[ 1 of 2]-Minibatch[  71-  80 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.75391235; EvalErr[0]PerSample = 0.50781250; TotalTime = 0.21845s; TotalTimePerSample = 0.08533ms; SamplesPerSecond = 11719
- Epoch[ 1 of 2]-Minibatch[  81-  90 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.66460266; EvalErr[0]PerSample = 0.45742187; TotalTime = 0.21084s; TotalTimePerSample = 0.08236ms; SamplesPerSecond = 12142
- Epoch[ 1 of 2]-Minibatch[  91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.62184296; EvalErr[0]PerSample = 0.47968750; TotalTime = 0.20613s; TotalTimePerSample = 0.08052ms; SamplesPerSecond = 12419
+ Epoch[ 1 of 2]-Minibatch[   1-  10 of 320]: SamplesSeen = 2560; TrainLossPerSample =  3.89978180; EvalErr[0]PerSample = 0.84375000; TotalTime = 0.62266s; TotalTimePerSample = 0.24323ms; SamplesPerSecond = 4111
+ Epoch[ 1 of 2]-Minibatch[  11-  20 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.96755676; EvalErr[0]PerSample = 0.72031250; TotalTime = 0.30410s; TotalTimePerSample = 0.11879ms; SamplesPerSecond = 8418
+ Epoch[ 1 of 2]-Minibatch[  21-  30 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.55723495; EvalErr[0]PerSample = 0.65859375; TotalTime = 0.30677s; TotalTimePerSample = 0.11983ms; SamplesPerSecond = 8344
+ Epoch[ 1 of 2]-Minibatch[  31-  40 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.29642792; EvalErr[0]PerSample = 0.61992187; TotalTime = 0.29877s; TotalTimePerSample = 0.11671ms; SamplesPerSecond = 8568
+ Epoch[ 1 of 2]-Minibatch[  41-  50 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.02396469; EvalErr[0]PerSample = 0.55117187; TotalTime = 0.27956s; TotalTimePerSample = 0.10920ms; SamplesPerSecond = 9157
+ Epoch[ 1 of 2]-Minibatch[  51-  60 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.87309265; EvalErr[0]PerSample = 0.51484375; TotalTime = 0.26339s; TotalTimePerSample = 0.10289ms; SamplesPerSecond = 9719
+ Epoch[ 1 of 2]-Minibatch[  61-  70 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.78157196; EvalErr[0]PerSample = 0.50507813; TotalTime = 0.27964s; TotalTimePerSample = 0.10923ms; SamplesPerSecond = 9154
+ Epoch[ 1 of 2]-Minibatch[  71-  80 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.75391235; EvalErr[0]PerSample = 0.50781250; TotalTime = 0.29762s; TotalTimePerSample = 0.11626ms; SamplesPerSecond = 8601
+ Epoch[ 1 of 2]-Minibatch[  81-  90 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.66460266; EvalErr[0]PerSample = 0.45742187; TotalTime = 0.27883s; TotalTimePerSample = 0.10892ms; SamplesPerSecond = 9181
+ Epoch[ 1 of 2]-Minibatch[  91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.62184143; EvalErr[0]PerSample = 0.47968750; TotalTime = 0.26243s; TotalTimePerSample = 0.10251ms; SamplesPerSecond = 9755
 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times.
- Epoch[ 1 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.65328217; EvalErr[0]PerSample = 0.47265625; TotalTime = 0.20100s; TotalTimePerSample = 0.07851ms; SamplesPerSecond = 12736
- Epoch[ 1 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.50686798; EvalErr[0]PerSample = 0.44921875; TotalTime = 0.20189s; TotalTimePerSample = 0.07886ms; SamplesPerSecond = 12680
- Epoch[ 1 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.46723938; EvalErr[0]PerSample = 0.42304687; TotalTime = 0.20090s; TotalTimePerSample = 0.07847ms; SamplesPerSecond = 12742
- Epoch[ 1 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.49163513; EvalErr[0]PerSample = 0.44140625; TotalTime = 0.20162s; TotalTimePerSample = 0.07876ms; SamplesPerSecond = 12697
- Epoch[ 1 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.46437683; EvalErr[0]PerSample = 0.43398437; TotalTime = 0.20111s; TotalTimePerSample = 0.07856ms; SamplesPerSecond = 12729
- Epoch[ 1 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.43047485; EvalErr[0]PerSample = 0.43867187; TotalTime = 0.20070s; TotalTimePerSample = 0.07840ms; SamplesPerSecond = 12755
- Epoch[ 1 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.42105103; EvalErr[0]PerSample = 0.41992188; TotalTime = 0.20147s; TotalTimePerSample = 0.07870ms; SamplesPerSecond = 12706
- Epoch[ 1 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.46536560; EvalErr[0]PerSample = 0.42460938; TotalTime = 0.20084s; TotalTimePerSample = 0.07845ms; SamplesPerSecond = 12746
- Epoch[ 1 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.47426147; EvalErr[0]PerSample = 0.44062500; TotalTime = 0.20085s; TotalTimePerSample = 0.07846ms; SamplesPerSecond = 12745
- Epoch[ 1 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.42851257; EvalErr[0]PerSample = 0.44062500; TotalTime = 0.20094s; TotalTimePerSample = 0.07849ms; SamplesPerSecond = 12740
- Epoch[ 1 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.34079895; EvalErr[0]PerSample = 0.41171875; TotalTime = 0.20082s; TotalTimePerSample = 0.07844ms; SamplesPerSecond = 12747
- Epoch[ 1 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.39476929; EvalErr[0]PerSample = 0.42773438; TotalTime = 0.20133s; TotalTimePerSample = 0.07864ms; SamplesPerSecond = 12715
- Epoch[ 1 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.40154724; EvalErr[0]PerSample = 0.41250000; TotalTime = 0.20108s; TotalTimePerSample = 0.07855ms; SamplesPerSecond = 12731
- Epoch[ 1 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.39338379; EvalErr[0]PerSample = 0.42656250; TotalTime = 0.20143s; TotalTimePerSample = 0.07868ms; SamplesPerSecond = 12709
- Epoch[ 1 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.32473145; EvalErr[0]PerSample = 0.40117188; TotalTime = 0.20309s; TotalTimePerSample = 0.07933ms; SamplesPerSecond = 12605
- Epoch[ 1 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.27008972; EvalErr[0]PerSample = 0.39960937; TotalTime = 0.24300s; TotalTimePerSample = 0.09492ms; SamplesPerSecond = 10534
- Epoch[ 1 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.32326355; EvalErr[0]PerSample = 0.39296875; TotalTime = 0.20162s; TotalTimePerSample = 0.07876ms; SamplesPerSecond = 12697
- Epoch[ 1 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.25377502; EvalErr[0]PerSample = 0.38359375; TotalTime = 0.20118s; TotalTimePerSample = 0.07859ms; SamplesPerSecond = 12725
- Epoch[ 1 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23349915; EvalErr[0]PerSample = 0.37070313; TotalTime = 0.20074s; TotalTimePerSample = 0.07842ms; SamplesPerSecond = 12752
- Epoch[ 1 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.20884399; EvalErr[0]PerSample = 0.35937500; TotalTime = 0.20064s; TotalTimePerSample = 0.07838ms; SamplesPerSecond = 12759
- Epoch[ 1 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23698425; EvalErr[0]PerSample = 0.36914063; TotalTime = 0.20078s; TotalTimePerSample = 0.07843ms; SamplesPerSecond = 12750
- Epoch[ 1 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.22963867; EvalErr[0]PerSample = 0.37382813; TotalTime = 0.18087s; TotalTimePerSample = 0.07065ms; SamplesPerSecond = 14153
-Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 1.6516994; EvalErrPerSample = 0.46788332; Ave LearnRatePerSample = 0.003125000047; EpochTime=11.137228
+ Epoch[ 1 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.65328064; EvalErr[0]PerSample = 0.47265625; TotalTime = 0.24968s; TotalTimePerSample = 0.09753ms; SamplesPerSecond = 10253
+ Epoch[ 1 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.50686951; EvalErr[0]PerSample = 0.44921875; TotalTime = 0.23939s; TotalTimePerSample = 0.09351ms; SamplesPerSecond = 10693
+ Epoch[ 1 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.46723938; EvalErr[0]PerSample = 0.42304687; TotalTime = 0.28085s; TotalTimePerSample = 0.10971ms; SamplesPerSecond = 9115
+ Epoch[ 1 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.49163513; EvalErr[0]PerSample = 0.44140625; TotalTime = 0.31287s; TotalTimePerSample = 0.12222ms; SamplesPerSecond = 8182
+ Epoch[ 1 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.46437683; EvalErr[0]PerSample = 0.43398437; TotalTime = 0.29536s; TotalTimePerSample = 0.11538ms; SamplesPerSecond = 8667
+ Epoch[ 1 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.43047485; EvalErr[0]PerSample = 0.43867187; TotalTime = 0.28569s; TotalTimePerSample = 0.11160ms; SamplesPerSecond = 8960
+ Epoch[ 1 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.42106018; EvalErr[0]PerSample = 0.41992188; TotalTime = 0.30841s; TotalTimePerSample = 0.12047ms; SamplesPerSecond = 8300
+ Epoch[ 1 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.46538086; EvalErr[0]PerSample = 0.42421875; TotalTime = 0.28988s; TotalTimePerSample = 0.11323ms; SamplesPerSecond = 8831
+ Epoch[ 1 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.47427673; EvalErr[0]PerSample = 0.44062500; TotalTime = 0.30135s; TotalTimePerSample = 0.11772ms; SamplesPerSecond = 8495
+ Epoch[ 1 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.42847290; EvalErr[0]PerSample = 0.44023438; TotalTime = 0.31460s; TotalTimePerSample = 0.12289ms; SamplesPerSecond = 8137
+ Epoch[ 1 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.34078064; EvalErr[0]PerSample = 0.41171875; TotalTime = 0.30250s; TotalTimePerSample = 0.11816ms; SamplesPerSecond = 8462
+ Epoch[ 1 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.39474487; EvalErr[0]PerSample = 0.42734375; TotalTime = 0.28411s; TotalTimePerSample = 0.11098ms; SamplesPerSecond = 9010
+ Epoch[ 1 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.40151062; EvalErr[0]PerSample = 0.41250000; TotalTime = 0.26734s; TotalTimePerSample = 0.10443ms; SamplesPerSecond = 9575
+ Epoch[ 1 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.39345703; EvalErr[0]PerSample = 0.42734375; TotalTime = 0.27080s; TotalTimePerSample = 0.10578ms; SamplesPerSecond = 9453
+ Epoch[ 1 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.32485046; EvalErr[0]PerSample = 0.40156250; TotalTime = 0.31168s; TotalTimePerSample = 0.12175ms; SamplesPerSecond = 8213
+ Epoch[ 1 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.27032471; EvalErr[0]PerSample = 0.39765625; TotalTime = 0.29405s; TotalTimePerSample = 0.11486ms; SamplesPerSecond = 8706
+ Epoch[ 1 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.32375488; EvalErr[0]PerSample = 0.39257813; TotalTime = 0.28044s; TotalTimePerSample = 0.10955ms; SamplesPerSecond = 9128
+ Epoch[ 1 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.25393982; EvalErr[0]PerSample = 0.38320312; TotalTime = 0.31065s; TotalTimePerSample = 0.12135ms; SamplesPerSecond = 8240
+ Epoch[ 1 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23377075; EvalErr[0]PerSample = 0.36953125; TotalTime = 0.29165s; TotalTimePerSample = 0.11393ms; SamplesPerSecond = 8777
+ Epoch[ 1 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.20861511; EvalErr[0]PerSample = 0.35976562; TotalTime = 0.31200s; TotalTimePerSample = 0.12187ms; SamplesPerSecond = 8205
+ Epoch[ 1 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23675232; EvalErr[0]PerSample = 0.36757812; TotalTime = 0.29517s; TotalTimePerSample = 0.11530ms; SamplesPerSecond = 8672
+ Epoch[ 1 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.22960205; EvalErr[0]PerSample = 0.37460938; TotalTime = 0.24842s; TotalTimePerSample = 0.09704ms; SamplesPerSecond = 10305
+Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 1.6517237; EvalErrPerSample = 0.46774903; Ave LearnRatePerSample = 0.003125000047; EpochTime=14.544218
 Starting Epoch 2: learning rate per sample = 0.003125  effective momentum = 0.900000 
 minibatchiterator: epoch 1: frames [81920..163840] (first utterance at frame 81920), data subset 0 of 1, with 1 datapasses
 
 Starting minibatch loop.
- Epoch[ 2 of 2]-Minibatch[   1-  10 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.21834393; EvalErr[0]PerSample = 0.37070313; TotalTime = 0.30891s; TotalTimePerSample = 0.12067ms; SamplesPerSecond = 8287
- Epoch[ 2 of 2]-Minibatch[  11-  20 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18350792; EvalErr[0]PerSample = 0.36718750; TotalTime = 0.29223s; TotalTimePerSample = 0.11415ms; SamplesPerSecond = 8760
- Epoch[ 2 of 2]-Minibatch[  21-  30 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.17285366; EvalErr[0]PerSample = 0.35937500; TotalTime = 0.28457s; TotalTimePerSample = 0.11116ms; SamplesPerSecond = 8995
- Epoch[ 2 of 2]-Minibatch[  31-  40 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.20266953; EvalErr[0]PerSample = 0.35898438; TotalTime = 0.26168s; TotalTimePerSample = 0.10222ms; SamplesPerSecond = 9782
- Epoch[ 2 of 2]-Minibatch[  41-  50 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.19678535; EvalErr[0]PerSample = 0.37890625; TotalTime = 0.24893s; TotalTimePerSample = 0.09724ms; SamplesPerSecond = 10283
- Epoch[ 2 of 2]-Minibatch[  51-  60 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16507607; EvalErr[0]PerSample = 0.34453125; TotalTime = 0.23976s; TotalTimePerSample = 0.09365ms; SamplesPerSecond = 10677
- Epoch[ 2 of 2]-Minibatch[  61-  70 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.13885193; EvalErr[0]PerSample = 0.34804687; TotalTime = 0.22890s; TotalTimePerSample = 0.08941ms; SamplesPerSecond = 11184
- Epoch[ 2 of 2]-Minibatch[  71-  80 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.19503098; EvalErr[0]PerSample = 0.37265625; TotalTime = 0.22024s; TotalTimePerSample = 0.08603ms; SamplesPerSecond = 11623
- Epoch[ 2 of 2]-Minibatch[  81-  90 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.24308472; EvalErr[0]PerSample = 0.37812500; TotalTime = 0.21525s; TotalTimePerSample = 0.08408ms; SamplesPerSecond = 11892
- Epoch[ 2 of 2]-Minibatch[  91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.19112320; EvalErr[0]PerSample = 0.36445312; TotalTime = 0.20867s; TotalTimePerSample = 0.08151ms; SamplesPerSecond = 12268
+ Epoch[ 2 of 2]-Minibatch[   1-  10 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.21869726; EvalErr[0]PerSample = 0.36992188; TotalTime = 0.29967s; TotalTimePerSample = 0.11706ms; SamplesPerSecond = 8542
+ Epoch[ 2 of 2]-Minibatch[  11-  20 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18345709; EvalErr[0]PerSample = 0.36679688; TotalTime = 0.30109s; TotalTimePerSample = 0.11761ms; SamplesPerSecond = 8502
+ Epoch[ 2 of 2]-Minibatch[  21-  30 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.17220440; EvalErr[0]PerSample = 0.35898438; TotalTime = 0.30479s; TotalTimePerSample = 0.11906ms; SamplesPerSecond = 8399
+ Epoch[ 2 of 2]-Minibatch[  31-  40 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.20035286; EvalErr[0]PerSample = 0.35781250; TotalTime = 0.30655s; TotalTimePerSample = 0.11975ms; SamplesPerSecond = 8350
+ Epoch[ 2 of 2]-Minibatch[  41-  50 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.19499779; EvalErr[0]PerSample = 0.37460938; TotalTime = 0.31306s; TotalTimePerSample = 0.12229ms; SamplesPerSecond = 8177
+ Epoch[ 2 of 2]-Minibatch[  51-  60 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16373482; EvalErr[0]PerSample = 0.34687500; TotalTime = 0.29670s; TotalTimePerSample = 0.11590ms; SamplesPerSecond = 8628
+ Epoch[ 2 of 2]-Minibatch[  61-  70 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.13869247; EvalErr[0]PerSample = 0.34804687; TotalTime = 0.31042s; TotalTimePerSample = 0.12126ms; SamplesPerSecond = 8246
+ Epoch[ 2 of 2]-Minibatch[  71-  80 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.19293823; EvalErr[0]PerSample = 0.36992188; TotalTime = 0.29057s; TotalTimePerSample = 0.11350ms; SamplesPerSecond = 8810
+ Epoch[ 2 of 2]-Minibatch[  81-  90 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23978348; EvalErr[0]PerSample = 0.37539062; TotalTime = 0.28823s; TotalTimePerSample = 0.11259ms; SamplesPerSecond = 8881
+ Epoch[ 2 of 2]-Minibatch[  91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18622742; EvalErr[0]PerSample = 0.36406250; TotalTime = 0.29621s; TotalTimePerSample = 0.11571ms; SamplesPerSecond = 8642
 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times.
- Epoch[ 2 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16928406; EvalErr[0]PerSample = 0.35898438; TotalTime = 0.20151s; TotalTimePerSample = 0.07871ms; SamplesPerSecond = 12704
- Epoch[ 2 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.24290924; EvalErr[0]PerSample = 0.38085938; TotalTime = 0.20117s; TotalTimePerSample = 0.07858ms; SamplesPerSecond = 12725
- Epoch[ 2 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.17869263; EvalErr[0]PerSample = 0.35039063; TotalTime = 0.20060s; TotalTimePerSample = 0.07836ms; SamplesPerSecond = 12761
- Epoch[ 2 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.21065826; EvalErr[0]PerSample = 0.36914063; TotalTime = 0.20114s; TotalTimePerSample = 0.07857ms; SamplesPerSecond = 12727
- Epoch[ 2 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.19442291; EvalErr[0]PerSample = 0.37109375; TotalTime = 0.20060s; TotalTimePerSample = 0.07836ms; SamplesPerSecond = 12761
- Epoch[ 2 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14822540; EvalErr[0]PerSample = 0.34453125; TotalTime = 0.20130s; TotalTimePerSample = 0.07863ms; SamplesPerSecond = 12717
- Epoch[ 2 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14246521; EvalErr[0]PerSample = 0.35664062; TotalTime = 0.20146s; TotalTimePerSample = 0.07869ms; SamplesPerSecond = 12707
- Epoch[ 2 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18051453; EvalErr[0]PerSample = 0.35078125; TotalTime = 0.20061s; TotalTimePerSample = 0.07836ms; SamplesPerSecond = 12760
- Epoch[ 2 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.15268555; EvalErr[0]PerSample = 0.35703125; TotalTime = 0.20114s; TotalTimePerSample = 0.07857ms; SamplesPerSecond = 12727
- Epoch[ 2 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.08914642; EvalErr[0]PerSample = 0.33945313; TotalTime = 0.20109s; TotalTimePerSample = 0.07855ms; SamplesPerSecond = 12730
- Epoch[ 2 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14708710; EvalErr[0]PerSample = 0.34765625; TotalTime = 0.20140s; TotalTimePerSample = 0.07867ms; SamplesPerSecond = 12711
- Epoch[ 2 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.17114868; EvalErr[0]PerSample = 0.35625000; TotalTime = 0.20171s; TotalTimePerSample = 0.07879ms; SamplesPerSecond = 12691
- Epoch[ 2 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.19283752; EvalErr[0]PerSample = 0.37539062; TotalTime = 0.20057s; TotalTimePerSample = 0.07835ms; SamplesPerSecond = 12763
- Epoch[ 2 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14776306; EvalErr[0]PerSample = 0.34921875; TotalTime = 0.20076s; TotalTimePerSample = 0.07842ms; SamplesPerSecond = 12751
- Epoch[ 2 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.15021973; EvalErr[0]PerSample = 0.35703125; TotalTime = 0.20060s; TotalTimePerSample = 0.07836ms; SamplesPerSecond = 12761
- Epoch[ 2 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.08143616; EvalErr[0]PerSample = 0.32851562; TotalTime = 0.20156s; TotalTimePerSample = 0.07873ms; SamplesPerSecond = 12700
- Epoch[ 2 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.09953003; EvalErr[0]PerSample = 0.34648438; TotalTime = 0.20113s; TotalTimePerSample = 0.07857ms; SamplesPerSecond = 12727
- Epoch[ 2 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.06625977; EvalErr[0]PerSample = 0.33750000; TotalTime = 0.20081s; TotalTimePerSample = 0.07844ms; SamplesPerSecond = 12748
- Epoch[ 2 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.09338989; EvalErr[0]PerSample = 0.33242187; TotalTime = 0.20097s; TotalTimePerSample = 0.07850ms; SamplesPerSecond = 12738
- Epoch[ 2 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.15601807; EvalErr[0]PerSample = 0.35898438; TotalTime = 0.20053s; TotalTimePerSample = 0.07833ms; SamplesPerSecond = 12766
- Epoch[ 2 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.10992432; EvalErr[0]PerSample = 0.34765625; TotalTime = 0.20125s; TotalTimePerSample = 0.07861ms; SamplesPerSecond = 12720
- Epoch[ 2 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.07529907; EvalErr[0]PerSample = 0.32890625; TotalTime = 0.18021s; TotalTimePerSample = 0.07040ms; SamplesPerSecond = 14205
-Finished Epoch[ 2 of 2]: [Training Set] TrainLossPerSample = 1.1596014; EvalErrPerSample = 0.35587159; Ave LearnRatePerSample = 0.003125000047; EpochTime=6.960407
+ Epoch[ 2 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16710815; EvalErr[0]PerSample = 0.35703125; TotalTime = 0.26584s; TotalTimePerSample = 0.10384ms; SamplesPerSecond = 9629
+ Epoch[ 2 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.24683685; EvalErr[0]PerSample = 0.38554688; TotalTime = 0.28340s; TotalTimePerSample = 0.11070ms; SamplesPerSecond = 9033
+ Epoch[ 2 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18601685; EvalErr[0]PerSample = 0.35273437; TotalTime = 0.31050s; TotalTimePerSample = 0.12129ms; SamplesPerSecond = 8244
+ Epoch[ 2 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.21721497; EvalErr[0]PerSample = 0.37617187; TotalTime = 0.30730s; TotalTimePerSample = 0.12004ms; SamplesPerSecond = 8330
+ Epoch[ 2 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.19934692; EvalErr[0]PerSample = 0.36953125; TotalTime = 0.28662s; TotalTimePerSample = 0.11196ms; SamplesPerSecond = 8931
+ Epoch[ 2 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.15099945; EvalErr[0]PerSample = 0.34257813; TotalTime = 0.27093s; TotalTimePerSample = 0.10583ms; SamplesPerSecond = 9448
+ Epoch[ 2 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14984589; EvalErr[0]PerSample = 0.35703125; TotalTime = 0.25383s; TotalTimePerSample = 0.09915ms; SamplesPerSecond = 10085
+ Epoch[ 2 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.19028320; EvalErr[0]PerSample = 0.35898438; TotalTime = 0.27065s; TotalTimePerSample = 0.10572ms; SamplesPerSecond = 9458
+ Epoch[ 2 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16434784; EvalErr[0]PerSample = 0.36406250; TotalTime = 0.31594s; TotalTimePerSample = 0.12341ms; SamplesPerSecond = 8102
+ Epoch[ 2 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.08853760; EvalErr[0]PerSample = 0.33359375; TotalTime = 0.30973s; TotalTimePerSample = 0.12099ms; SamplesPerSecond = 8265
+ Epoch[ 2 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.15194244; EvalErr[0]PerSample = 0.35039063; TotalTime = 0.31183s; TotalTimePerSample = 0.12181ms; SamplesPerSecond = 8209
+ Epoch[ 2 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16113434; EvalErr[0]PerSample = 0.35625000; TotalTime = 0.31285s; TotalTimePerSample = 0.12221ms; SamplesPerSecond = 8182
+ Epoch[ 2 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18479004; EvalErr[0]PerSample = 0.36757812; TotalTime = 0.29723s; TotalTimePerSample = 0.11611ms; SamplesPerSecond = 8612
+ Epoch[ 2 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14554138; EvalErr[0]PerSample = 0.34843750; TotalTime = 0.27815s; TotalTimePerSample = 0.10865ms; SamplesPerSecond = 9203
+ Epoch[ 2 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.15263367; EvalErr[0]PerSample = 0.35390625; TotalTime = 0.26319s; TotalTimePerSample = 0.10281ms; SamplesPerSecond = 9726
+ Epoch[ 2 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.08563538; EvalErr[0]PerSample = 0.33437500; TotalTime = 0.24924s; TotalTimePerSample = 0.09736ms; SamplesPerSecond = 10271
+ Epoch[ 2 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.10797424; EvalErr[0]PerSample = 0.34882812; TotalTime = 0.23954s; TotalTimePerSample = 0.09357ms; SamplesPerSecond = 10687
+ Epoch[ 2 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.07031860; EvalErr[0]PerSample = 0.33593750; TotalTime = 0.23508s; TotalTimePerSample = 0.09183ms; SamplesPerSecond = 10889
+ Epoch[ 2 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.09429016; EvalErr[0]PerSample = 0.33476563; TotalTime = 0.30298s; TotalTimePerSample = 0.11835ms; SamplesPerSecond = 8449
+ Epoch[ 2 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14634094; EvalErr[0]PerSample = 0.35351563; TotalTime = 0.29330s; TotalTimePerSample = 0.11457ms; SamplesPerSecond = 8728
+ Epoch[ 2 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.10476990; EvalErr[0]PerSample = 0.34335938; TotalTime = 0.27493s; TotalTimePerSample = 0.10740ms; SamplesPerSecond = 9311
+ Epoch[ 2 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.07355957; EvalErr[0]PerSample = 0.32695313; TotalTime = 0.23165s; TotalTimePerSample = 0.09049ms; SamplesPerSecond = 11051
+Finished Epoch[ 2 of 2]: [Training Set] TrainLossPerSample = 1.1603298; EvalErrPerSample = 0.35574952; Ave LearnRatePerSample = 0.003125000047; EpochTime=9.225137
 CNTKCommandTrainEnd: DPT_Pre1
 
 
@@ -808,6 +867,24 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node cr. 6 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node cr, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -849,7 +926,7 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
-Validating for node ScaledLogLikelihood. 2 nodes to process in pass 2.
+Validating for node ScaledLogLikelihood. 7 nodes to process in pass 2.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
 Validating --> HL1.W = LearnableParameter -> [512, 363]
@@ -910,6 +987,25 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
+Validating for node ScaledLogLikelihood. 7 nodes to process in pass 2.
+
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> GlobalPrior = LearnableParameter -> [132, 1]
+Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
+
 Validating for node ScaledLogLikelihood, final verification.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -951,7 +1047,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
-Validating for node Err. 1 nodes to process in pass 2.
+Validating for node Err. 6 nodes to process in pass 2.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -1009,6 +1105,24 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node Err. 6 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL1.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node Err, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -1077,7 +1191,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
-Validating for node cr. 3 nodes to process in pass 2.
+Validating for node cr. 9 nodes to process in pass 2.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -1150,6 +1264,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node cr. 9 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node cr, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -1201,6 +1338,30 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
+Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2.
+
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> GlobalPrior = LearnableParameter -> [132, 1]
+Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
+
 Validating for node ScaledLogLikelihood, final verification.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -1253,6 +1414,30 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
+Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2.
+
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> GlobalPrior = LearnableParameter -> [132, 1]
+Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
+
 Validating for node ScaledLogLikelihood, final verification.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -1304,6 +1489,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node Err. 9 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node Err, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -1354,6 +1562,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node Err. 9 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node Err, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -1388,7 +1619,7 @@ htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Spe
 ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
 label set 0: 129 classes
 minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
-Starting from checkpoint. Load Network From File C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0.
+Starting from checkpoint. Load Network From File C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/Pre2/cntkSpeech.0.
 
 
 Printing Gradient Computation Node Order ... 
@@ -1510,6 +1741,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node cr. 9 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node cr, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -1561,7 +1815,7 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
-Validating for node ScaledLogLikelihood. 2 nodes to process in pass 2.
+Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
 Validating --> HL2.W = LearnableParameter -> [512, 512]
@@ -1637,6 +1891,30 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
+Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2.
+
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> GlobalPrior = LearnableParameter -> [132, 1]
+Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
+
 Validating for node ScaledLogLikelihood, final verification.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -1688,7 +1966,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
-Validating for node Err. 1 nodes to process in pass 2.
+Validating for node Err. 9 nodes to process in pass 2.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -1761,6 +2039,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node Err. 9 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node Err, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -1795,78 +2096,78 @@ minibatchiterator: epoch 0: frames [0..81920] (first utterance at frame 0), data
 requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
 
 Starting minibatch loop.
- Epoch[ 1 of 2]-Minibatch[   1-  10 of 320]: SamplesSeen = 2560; TrainLossPerSample =  4.36024933; EvalErr[0]PerSample = 0.80703125; TotalTime = 0.25347s; TotalTimePerSample = 0.09901ms; SamplesPerSecond = 10099
- Epoch[ 1 of 2]-Minibatch[  11-  20 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.80374603; EvalErr[0]PerSample = 0.67890625; TotalTime = 0.23590s; TotalTimePerSample = 0.09215ms; SamplesPerSecond = 10852
- Epoch[ 1 of 2]-Minibatch[  21-  30 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.23118515; EvalErr[0]PerSample = 0.59218750; TotalTime = 0.22699s; TotalTimePerSample = 0.08867ms; SamplesPerSecond = 11277
- Epoch[ 1 of 2]-Minibatch[  31-  40 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.89543457; EvalErr[0]PerSample = 0.50625000; TotalTime = 0.22699s; TotalTimePerSample = 0.08867ms; SamplesPerSecond = 11278
- Epoch[ 1 of 2]-Minibatch[  41-  50 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.69047775; EvalErr[0]PerSample = 0.47460938; TotalTime = 0.22702s; TotalTimePerSample = 0.08868ms; SamplesPerSecond = 11276
- Epoch[ 1 of 2]-Minibatch[  51-  60 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.58694305; EvalErr[0]PerSample = 0.45898438; TotalTime = 0.22739s; TotalTimePerSample = 0.08882ms; SamplesPerSecond = 11258
- Epoch[ 1 of 2]-Minibatch[  61-  70 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.48457794; EvalErr[0]PerSample = 0.43281250; TotalTime = 0.22649s; TotalTimePerSample = 0.08847ms; SamplesPerSecond = 11303
- Epoch[ 1 of 2]-Minibatch[  71-  80 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.48614807; EvalErr[0]PerSample = 0.43203125; TotalTime = 0.22697s; TotalTimePerSample = 0.08866ms; SamplesPerSecond = 11278
- Epoch[ 1 of 2]-Minibatch[  81-  90 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.45286255; EvalErr[0]PerSample = 0.41992188; TotalTime = 0.22646s; TotalTimePerSample = 0.08846ms; SamplesPerSecond = 11304
- Epoch[ 1 of 2]-Minibatch[  91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.41294861; EvalErr[0]PerSample = 0.40937500; TotalTime = 0.22741s; TotalTimePerSample = 0.08883ms; SamplesPerSecond = 11257
+ Epoch[ 1 of 2]-Minibatch[   1-  10 of 320]: SamplesSeen = 2560; TrainLossPerSample =  4.49739113; EvalErr[0]PerSample = 0.80429688; TotalTime = 0.30184s; TotalTimePerSample = 0.11790ms; SamplesPerSecond = 8481
+ Epoch[ 1 of 2]-Minibatch[  11-  20 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.83226433; EvalErr[0]PerSample = 0.68125000; TotalTime = 0.26951s; TotalTimePerSample = 0.10528ms; SamplesPerSecond = 9498
+ Epoch[ 1 of 2]-Minibatch[  21-  30 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.25921097; EvalErr[0]PerSample = 0.59921875; TotalTime = 0.25048s; TotalTimePerSample = 0.09784ms; SamplesPerSecond = 10220
+ Epoch[ 1 of 2]-Minibatch[  31-  40 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.91240921; EvalErr[0]PerSample = 0.51210937; TotalTime = 0.26010s; TotalTimePerSample = 0.10160ms; SamplesPerSecond = 9842
+ Epoch[ 1 of 2]-Minibatch[  41-  50 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.69259949; EvalErr[0]PerSample = 0.46679688; TotalTime = 0.34494s; TotalTimePerSample = 0.13474ms; SamplesPerSecond = 7421
+ Epoch[ 1 of 2]-Minibatch[  51-  60 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.59069672; EvalErr[0]PerSample = 0.45312500; TotalTime = 0.33370s; TotalTimePerSample = 0.13035ms; SamplesPerSecond = 7671
+ Epoch[ 1 of 2]-Minibatch[  61-  70 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.48813324; EvalErr[0]PerSample = 0.43789062; TotalTime = 0.32515s; TotalTimePerSample = 0.12701ms; SamplesPerSecond = 7873
+ Epoch[ 1 of 2]-Minibatch[  71-  80 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.48960571; EvalErr[0]PerSample = 0.43515625; TotalTime = 0.30350s; TotalTimePerSample = 0.11856ms; SamplesPerSecond = 8434
+ Epoch[ 1 of 2]-Minibatch[  81-  90 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.45628204; EvalErr[0]PerSample = 0.42187500; TotalTime = 0.28491s; TotalTimePerSample = 0.11129ms; SamplesPerSecond = 8985
+ Epoch[ 1 of 2]-Minibatch[  91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.41567383; EvalErr[0]PerSample = 0.40820313; TotalTime = 0.27054s; TotalTimePerSample = 0.10568ms; SamplesPerSecond = 9462
 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times.
- Epoch[ 1 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.41962891; EvalErr[0]PerSample = 0.41132812; TotalTime = 0.22724s; TotalTimePerSample = 0.08877ms; SamplesPerSecond = 11265
- Epoch[ 1 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.33667145; EvalErr[0]PerSample = 0.39335938; TotalTime = 0.22657s; TotalTimePerSample = 0.08851ms; SamplesPerSecond = 11298
- Epoch[ 1 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.31494751; EvalErr[0]PerSample = 0.38281250; TotalTime = 0.22699s; TotalTimePerSample = 0.08867ms; SamplesPerSecond = 11278
- Epoch[ 1 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.33030090; EvalErr[0]PerSample = 0.39648438; TotalTime = 0.22693s; TotalTimePerSample = 0.08864ms; SamplesPerSecond = 11281
- Epoch[ 1 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.31785889; EvalErr[0]PerSample = 0.38789062; TotalTime = 0.22728s; TotalTimePerSample = 0.08878ms; SamplesPerSecond = 11263
- Epoch[ 1 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.27231445; EvalErr[0]PerSample = 0.38125000; TotalTime = 0.22750s; TotalTimePerSample = 0.08887ms; SamplesPerSecond = 11252
- Epoch[ 1 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.29524231; EvalErr[0]PerSample = 0.38359375; TotalTime = 0.22861s; TotalTimePerSample = 0.08930ms; SamplesPerSecond = 11198
- Epoch[ 1 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.32531738; EvalErr[0]PerSample = 0.39023438; TotalTime = 0.24451s; TotalTimePerSample = 0.09551ms; SamplesPerSecond = 10470
- Epoch[ 1 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.33986511; EvalErr[0]PerSample = 0.41367188; TotalTime = 0.22770s; TotalTimePerSample = 0.08895ms; SamplesPerSecond = 11242
- Epoch[ 1 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.31717529; EvalErr[0]PerSample = 0.41093750; TotalTime = 0.22722s; TotalTimePerSample = 0.08876ms; SamplesPerSecond = 11266
- Epoch[ 1 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23374634; EvalErr[0]PerSample = 0.37695313; TotalTime = 0.22688s; TotalTimePerSample = 0.08862ms; SamplesPerSecond = 11283
- Epoch[ 1 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.26781921; EvalErr[0]PerSample = 0.38867188; TotalTime = 0.22676s; TotalTimePerSample = 0.08858ms; SamplesPerSecond = 11289
- Epoch[ 1 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.26815796; EvalErr[0]PerSample = 0.37890625; TotalTime = 0.22765s; TotalTimePerSample = 0.08893ms; SamplesPerSecond = 11245
- Epoch[ 1 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23847656; EvalErr[0]PerSample = 0.36757812; TotalTime = 0.22690s; TotalTimePerSample = 0.08863ms; SamplesPerSecond = 11282
- Epoch[ 1 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.21604004; EvalErr[0]PerSample = 0.36640625; TotalTime = 0.22660s; TotalTimePerSample = 0.08851ms; SamplesPerSecond = 11297
- Epoch[ 1 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18456726; EvalErr[0]PerSample = 0.36562500; TotalTime = 0.26425s; TotalTimePerSample = 0.10322ms; SamplesPerSecond = 9687
- Epoch[ 1 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.24215698; EvalErr[0]PerSample = 0.36796875; TotalTime = 0.32872s; TotalTimePerSample = 0.12841ms; SamplesPerSecond = 7787
- Epoch[ 1 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18797607; EvalErr[0]PerSample = 0.36445312; TotalTime = 0.33424s; TotalTimePerSample = 0.13056ms; SamplesPerSecond = 7659
- Epoch[ 1 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16840210; EvalErr[0]PerSample = 0.35351563; TotalTime = 0.34198s; TotalTimePerSample = 0.13359ms; SamplesPerSecond = 7485
- Epoch[ 1 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14445496; EvalErr[0]PerSample = 0.34414062; TotalTime = 0.33989s; TotalTimePerSample = 0.13277ms; SamplesPerSecond = 7531
- Epoch[ 1 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.17367554; EvalErr[0]PerSample = 0.35039063; TotalTime = 0.32337s; TotalTimePerSample = 0.12632ms; SamplesPerSecond = 7916
- Epoch[ 1 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18674622; EvalErr[0]PerSample = 0.36875000; TotalTime = 0.27449s; TotalTimePerSample = 0.10722ms; SamplesPerSecond = 9326
-Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 1.5058161; EvalErrPerSample = 0.42365724; Ave LearnRatePerSample = 0.003125000047; EpochTime=11.880998
+ Epoch[ 1 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.42048950; EvalErr[0]PerSample = 0.41406250; TotalTime = 0.25862s; TotalTimePerSample = 0.10102ms; SamplesPerSecond = 9898
+ Epoch[ 1 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.34279480; EvalErr[0]PerSample = 0.39726563; TotalTime = 0.24826s; TotalTimePerSample = 0.09698ms; SamplesPerSecond = 10311
+ Epoch[ 1 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.31633148; EvalErr[0]PerSample = 0.38789062; TotalTime = 0.29231s; TotalTimePerSample = 0.11418ms; SamplesPerSecond = 8757
+ Epoch[ 1 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.33296814; EvalErr[0]PerSample = 0.39804688; TotalTime = 0.34247s; TotalTimePerSample = 0.13378ms; SamplesPerSecond = 7475
+ Epoch[ 1 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.32084351; EvalErr[0]PerSample = 0.39609375; TotalTime = 0.34517s; TotalTimePerSample = 0.13483ms; SamplesPerSecond = 7416
+ Epoch[ 1 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.27189636; EvalErr[0]PerSample = 0.38125000; TotalTime = 0.34273s; TotalTimePerSample = 0.13388ms; SamplesPerSecond = 7469
+ Epoch[ 1 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.29380188; EvalErr[0]PerSample = 0.38554688; TotalTime = 0.33912s; TotalTimePerSample = 0.13247ms; SamplesPerSecond = 7548
+ Epoch[ 1 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.31463013; EvalErr[0]PerSample = 0.38984375; TotalTime = 0.32910s; TotalTimePerSample = 0.12855ms; SamplesPerSecond = 7778
+ Epoch[ 1 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.33578796; EvalErr[0]PerSample = 0.40664062; TotalTime = 0.34127s; TotalTimePerSample = 0.13331ms; SamplesPerSecond = 7501
+ Epoch[ 1 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.32202454; EvalErr[0]PerSample = 0.41484375; TotalTime = 0.31738s; TotalTimePerSample = 0.12398ms; SamplesPerSecond = 8066
+ Epoch[ 1 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23669434; EvalErr[0]PerSample = 0.37460938; TotalTime = 0.32630s; TotalTimePerSample = 0.12746ms; SamplesPerSecond = 7845
+ Epoch[ 1 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.27109985; EvalErr[0]PerSample = 0.38906250; TotalTime = 0.34553s; TotalTimePerSample = 0.13497ms; SamplesPerSecond = 7408
+ Epoch[ 1 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.26419678; EvalErr[0]PerSample = 0.37578125; TotalTime = 0.33855s; TotalTimePerSample = 0.13224ms; SamplesPerSecond = 7561
+ Epoch[ 1 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23778992; EvalErr[0]PerSample = 0.37265625; TotalTime = 0.32655s; TotalTimePerSample = 0.12756ms; SamplesPerSecond = 7839
+ Epoch[ 1 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.21040344; EvalErr[0]PerSample = 0.36757812; TotalTime = 0.34363s; TotalTimePerSample = 0.13423ms; SamplesPerSecond = 7449
+ Epoch[ 1 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18387146; EvalErr[0]PerSample = 0.36562500; TotalTime = 0.33771s; TotalTimePerSample = 0.13192ms; SamplesPerSecond = 7580
+ Epoch[ 1 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23827515; EvalErr[0]PerSample = 0.37148437; TotalTime = 0.32123s; TotalTimePerSample = 0.12548ms; SamplesPerSecond = 7969
+ Epoch[ 1 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18418274; EvalErr[0]PerSample = 0.36328125; TotalTime = 0.31703s; TotalTimePerSample = 0.12384ms; SamplesPerSecond = 8074
+ Epoch[ 1 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16851501; EvalErr[0]PerSample = 0.35234375; TotalTime = 0.34084s; TotalTimePerSample = 0.13314ms; SamplesPerSecond = 7510
+ Epoch[ 1 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14337463; EvalErr[0]PerSample = 0.34375000; TotalTime = 0.34387s; TotalTimePerSample = 0.13432ms; SamplesPerSecond = 7444
+ Epoch[ 1 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.17227478; EvalErr[0]PerSample = 0.34882812; TotalTime = 0.34417s; TotalTimePerSample = 0.13444ms; SamplesPerSecond = 7438
+ Epoch[ 1 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18431091; EvalErr[0]PerSample = 0.36835937; TotalTime = 0.30969s; TotalTimePerSample = 0.12097ms; SamplesPerSecond = 8266
+Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 1.5125258; EvalErrPerSample = 0.42452392; Ave LearnRatePerSample = 0.003125000047; EpochTime=14.904725
 Starting Epoch 2: learning rate per sample = 0.003125  effective momentum = 0.900000 
 minibatchiterator: epoch 1: frames [81920..163840] (first utterance at frame 81920), data subset 0 of 1, with 1 datapasses
 
 Starting minibatch loop.
- Epoch[ 2 of 2]-Minibatch[   1-  10 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.17634354; EvalErr[0]PerSample = 0.35351563; TotalTime = 0.34560s; TotalTimePerSample = 0.13500ms; SamplesPerSecond = 7407
- Epoch[ 2 of 2]-Minibatch[  11-  20 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14589901; EvalErr[0]PerSample = 0.35664062; TotalTime = 0.34276s; TotalTimePerSample = 0.13389ms; SamplesPerSecond = 7468
- Epoch[ 2 of 2]-Minibatch[  21-  30 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.15637836; EvalErr[0]PerSample = 0.35117188; TotalTime = 0.32955s; TotalTimePerSample = 0.12873ms; SamplesPerSecond = 7768
- Epoch[ 2 of 2]-Minibatch[  31-  40 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14355202; EvalErr[0]PerSample = 0.34179688; TotalTime = 0.33820s; TotalTimePerSample = 0.13211ms; SamplesPerSecond = 7569
- Epoch[ 2 of 2]-Minibatch[  41-  50 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14718361; EvalErr[0]PerSample = 0.36093750; TotalTime = 0.33625s; TotalTimePerSample = 0.13135ms; SamplesPerSecond = 7613
- Epoch[ 2 of 2]-Minibatch[  51-  60 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14584732; EvalErr[0]PerSample = 0.33945313; TotalTime = 0.33316s; TotalTimePerSample = 0.13014ms; SamplesPerSecond = 7684
- Epoch[ 2 of 2]-Minibatch[  61-  70 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.09562225; EvalErr[0]PerSample = 0.33789063; TotalTime = 0.32798s; TotalTimePerSample = 0.12812ms; SamplesPerSecond = 7805
- Epoch[ 2 of 2]-Minibatch[  71-  80 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16124268; EvalErr[0]PerSample = 0.35859375; TotalTime = 0.33002s; TotalTimePerSample = 0.12892ms; SamplesPerSecond = 7757
- Epoch[ 2 of 2]-Minibatch[  81-  90 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16578064; EvalErr[0]PerSample = 0.36210938; TotalTime = 0.35216s; TotalTimePerSample = 0.13756ms; SamplesPerSecond = 7269
- Epoch[ 2 of 2]-Minibatch[  91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12110596; EvalErr[0]PerSample = 0.34218750; TotalTime = 0.34197s; TotalTimePerSample = 0.13358ms; SamplesPerSecond = 7486
+ Epoch[ 2 of 2]-Minibatch[   1-  10 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.17448177; EvalErr[0]PerSample = 0.35195312; TotalTime = 0.32358s; TotalTimePerSample = 0.12640ms; SamplesPerSecond = 7911
+ Epoch[ 2 of 2]-Minibatch[  11-  20 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14536781; EvalErr[0]PerSample = 0.35664062; TotalTime = 0.32840s; TotalTimePerSample = 0.12828ms; SamplesPerSecond = 7795
+ Epoch[ 2 of 2]-Minibatch[  21-  30 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.15722904; EvalErr[0]PerSample = 0.34531250; TotalTime = 0.33313s; TotalTimePerSample = 0.13013ms; SamplesPerSecond = 7684
+ Epoch[ 2 of 2]-Minibatch[  31-  40 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14344521; EvalErr[0]PerSample = 0.34804687; TotalTime = 0.34423s; TotalTimePerSample = 0.13446ms; SamplesPerSecond = 7436
+ Epoch[ 2 of 2]-Minibatch[  41-  50 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14842377; EvalErr[0]PerSample = 0.36562500; TotalTime = 0.33629s; TotalTimePerSample = 0.13136ms; SamplesPerSecond = 7612
+ Epoch[ 2 of 2]-Minibatch[  51-  60 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.14489059; EvalErr[0]PerSample = 0.34218750; TotalTime = 0.32385s; TotalTimePerSample = 0.12650ms; SamplesPerSecond = 7905
+ Epoch[ 2 of 2]-Minibatch[  61-  70 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.09631195; EvalErr[0]PerSample = 0.33984375; TotalTime = 0.30827s; TotalTimePerSample = 0.12042ms; SamplesPerSecond = 8304
+ Epoch[ 2 of 2]-Minibatch[  71-  80 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16026917; EvalErr[0]PerSample = 0.35546875; TotalTime = 0.33824s; TotalTimePerSample = 0.13212ms; SamplesPerSecond = 7568
+ Epoch[ 2 of 2]-Minibatch[  81-  90 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16528091; EvalErr[0]PerSample = 0.36015625; TotalTime = 0.34935s; TotalTimePerSample = 0.13646ms; SamplesPerSecond = 7327
+ Epoch[ 2 of 2]-Minibatch[  91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12257309; EvalErr[0]PerSample = 0.34492187; TotalTime = 0.30700s; TotalTimePerSample = 0.11992ms; SamplesPerSecond = 8338
 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times.
- Epoch[ 2 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12307053; EvalErr[0]PerSample = 0.35000000; TotalTime = 0.32871s; TotalTimePerSample = 0.12840ms; SamplesPerSecond = 7787
- Epoch[ 2 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18570023; EvalErr[0]PerSample = 0.36328125; TotalTime = 0.32470s; TotalTimePerSample = 0.12683ms; SamplesPerSecond = 7884
- Epoch[ 2 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12998352; EvalErr[0]PerSample = 0.33789063; TotalTime = 0.33742s; TotalTimePerSample = 0.13181ms; SamplesPerSecond = 7586
- Epoch[ 2 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16931915; EvalErr[0]PerSample = 0.35429688; TotalTime = 0.34228s; TotalTimePerSample = 0.13370ms; SamplesPerSecond = 7479
- Epoch[ 2 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.11923828; EvalErr[0]PerSample = 0.34335938; TotalTime = 0.33777s; TotalTimePerSample = 0.13194ms; SamplesPerSecond = 7579
- Epoch[ 2 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.09157715; EvalErr[0]PerSample = 0.33789063; TotalTime = 0.34325s; TotalTimePerSample = 0.13408ms; SamplesPerSecond = 7458
- Epoch[ 2 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.10869598; EvalErr[0]PerSample = 0.33945313; TotalTime = 0.34204s; TotalTimePerSample = 0.13361ms; SamplesPerSecond = 7484
- Epoch[ 2 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12568817; EvalErr[0]PerSample = 0.33515625; TotalTime = 0.36284s; TotalTimePerSample = 0.14173ms; SamplesPerSecond = 7055
- Epoch[ 2 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.10168304; EvalErr[0]PerSample = 0.33945313; TotalTime = 0.33356s; TotalTimePerSample = 0.13030ms; SamplesPerSecond = 7674
- Epoch[ 2 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.05705414; EvalErr[0]PerSample = 0.33281250; TotalTime = 0.31607s; TotalTimePerSample = 0.12347ms; SamplesPerSecond = 8099
- Epoch[ 2 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.10415344; EvalErr[0]PerSample = 0.34296875; TotalTime = 0.33578s; TotalTimePerSample = 0.13116ms; SamplesPerSecond = 7624
- Epoch[ 2 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.13464966; EvalErr[0]PerSample = 0.34375000; TotalTime = 0.34351s; TotalTimePerSample = 0.13418ms; SamplesPerSecond = 7452
- Epoch[ 2 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12070007; EvalErr[0]PerSample = 0.34296875; TotalTime = 0.32422s; TotalTimePerSample = 0.12665ms; SamplesPerSecond = 7895
- Epoch[ 2 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.10966797; EvalErr[0]PerSample = 0.33671875; TotalTime = 0.33828s; TotalTimePerSample = 0.13214ms; SamplesPerSecond = 7567
- Epoch[ 2 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.09540100; EvalErr[0]PerSample = 0.33593750; TotalTime = 0.33439s; TotalTimePerSample = 0.13062ms; SamplesPerSecond = 7655
- Epoch[ 2 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.05154724; EvalErr[0]PerSample = 0.33476563; TotalTime = 0.31050s; TotalTimePerSample = 0.12129ms; SamplesPerSecond = 8244
- Epoch[ 2 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.04845581; EvalErr[0]PerSample = 0.33593750; TotalTime = 0.29355s; TotalTimePerSample = 0.11467ms; SamplesPerSecond = 8720
- Epoch[ 2 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.03578491; EvalErr[0]PerSample = 0.32226563; TotalTime = 0.27642s; TotalTimePerSample = 0.10798ms; SamplesPerSecond = 9261
- Epoch[ 2 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.05906372; EvalErr[0]PerSample = 0.32539062; TotalTime = 0.26455s; TotalTimePerSample = 0.10334ms; SamplesPerSecond = 9676
- Epoch[ 2 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.11199341; EvalErr[0]PerSample = 0.34218750; TotalTime = 0.25517s; TotalTimePerSample = 0.09968ms; SamplesPerSecond = 10032
- Epoch[ 2 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.08496399; EvalErr[0]PerSample = 0.33398438; TotalTime = 0.24490s; TotalTimePerSample = 0.09566ms; SamplesPerSecond = 10453
- Epoch[ 2 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.04550171; EvalErr[0]PerSample = 0.31914063; TotalTime = 0.21540s; TotalTimePerSample = 0.08414ms; SamplesPerSecond = 11884
-Finished Epoch[ 2 of 2]: [Training Set] TrainLossPerSample = 1.1147765; EvalErrPerSample = 0.34230956; Ave LearnRatePerSample = 0.003125000047; EpochTime=10.335151
+ Epoch[ 2 of 2]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12313004; EvalErr[0]PerSample = 0.34765625; TotalTime = 0.33577s; TotalTimePerSample = 0.13116ms; SamplesPerSecond = 7624
+ Epoch[ 2 of 2]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18492050; EvalErr[0]PerSample = 0.36171875; TotalTime = 0.33462s; TotalTimePerSample = 0.13071ms; SamplesPerSecond = 7650
+ Epoch[ 2 of 2]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.13058014; EvalErr[0]PerSample = 0.33476563; TotalTime = 0.31019s; TotalTimePerSample = 0.12117ms; SamplesPerSecond = 8253
+ Epoch[ 2 of 2]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16725922; EvalErr[0]PerSample = 0.35781250; TotalTime = 0.29240s; TotalTimePerSample = 0.11422ms; SamplesPerSecond = 8755
+ Epoch[ 2 of 2]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12244568; EvalErr[0]PerSample = 0.34648438; TotalTime = 0.37010s; TotalTimePerSample = 0.14457ms; SamplesPerSecond = 6917
+ Epoch[ 2 of 2]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.09480591; EvalErr[0]PerSample = 0.33671875; TotalTime = 0.30263s; TotalTimePerSample = 0.11822ms; SamplesPerSecond = 8459
+ Epoch[ 2 of 2]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.11218109; EvalErr[0]PerSample = 0.34140625; TotalTime = 0.33987s; TotalTimePerSample = 0.13276ms; SamplesPerSecond = 7532
+ Epoch[ 2 of 2]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.11966095; EvalErr[0]PerSample = 0.33398438; TotalTime = 0.31333s; TotalTimePerSample = 0.12240ms; SamplesPerSecond = 8170
+ Epoch[ 2 of 2]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.10485687; EvalErr[0]PerSample = 0.33671875; TotalTime = 0.29728s; TotalTimePerSample = 0.11613ms; SamplesPerSecond = 8611
+ Epoch[ 2 of 2]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.06019897; EvalErr[0]PerSample = 0.32617188; TotalTime = 0.27964s; TotalTimePerSample = 0.10923ms; SamplesPerSecond = 9154
+ Epoch[ 2 of 2]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.10600891; EvalErr[0]PerSample = 0.34101562; TotalTime = 0.26485s; TotalTimePerSample = 0.10346ms; SamplesPerSecond = 9665
+ Epoch[ 2 of 2]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.13724976; EvalErr[0]PerSample = 0.34101562; TotalTime = 0.31315s; TotalTimePerSample = 0.12232ms; SamplesPerSecond = 8175
+ Epoch[ 2 of 2]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12464752; EvalErr[0]PerSample = 0.34609375; TotalTime = 0.35469s; TotalTimePerSample = 0.13855ms; SamplesPerSecond = 7217
+ Epoch[ 2 of 2]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.10831604; EvalErr[0]PerSample = 0.33593750; TotalTime = 0.33481s; TotalTimePerSample = 0.13079ms; SamplesPerSecond = 7646
+ Epoch[ 2 of 2]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.09707031; EvalErr[0]PerSample = 0.34023437; TotalTime = 0.33923s; TotalTimePerSample = 0.13251ms; SamplesPerSecond = 7546
+ Epoch[ 2 of 2]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.04812317; EvalErr[0]PerSample = 0.32773438; TotalTime = 0.33522s; TotalTimePerSample = 0.13094ms; SamplesPerSecond = 7636
+ Epoch[ 2 of 2]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.04979248; EvalErr[0]PerSample = 0.33398438; TotalTime = 0.33766s; TotalTimePerSample = 0.13190ms; SamplesPerSecond = 7581
+ Epoch[ 2 of 2]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.03223572; EvalErr[0]PerSample = 0.31835938; TotalTime = 0.31202s; TotalTimePerSample = 0.12188ms; SamplesPerSecond = 8204
+ Epoch[ 2 of 2]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.05677490; EvalErr[0]PerSample = 0.32773438; TotalTime = 0.34007s; TotalTimePerSample = 0.13284ms; SamplesPerSecond = 7527
+ Epoch[ 2 of 2]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.10880737; EvalErr[0]PerSample = 0.34296875; TotalTime = 0.34820s; TotalTimePerSample = 0.13601ms; SamplesPerSecond = 7352
+ Epoch[ 2 of 2]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.08513489; EvalErr[0]PerSample = 0.33476563; TotalTime = 0.29751s; TotalTimePerSample = 0.11622ms; SamplesPerSecond = 8604
+ Epoch[ 2 of 2]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.04244080; EvalErr[0]PerSample = 0.31757812; TotalTime = 0.24686s; TotalTimePerSample = 0.09643ms; SamplesPerSecond = 10370
+Finished Epoch[ 2 of 2]: [Training Set] TrainLossPerSample = 1.1148411; EvalErrPerSample = 0.34190676; Ave LearnRatePerSample = 0.003125000047; EpochTime=10.343029
 CNTKCommandTrainEnd: DPT_Pre2
 
 
@@ -1989,6 +2290,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node cr. 9 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node cr, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -2040,7 +2364,7 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
-Validating for node ScaledLogLikelihood. 2 nodes to process in pass 2.
+Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
 Validating --> HL2.W = LearnableParameter -> [512, 512]
@@ -2116,6 +2440,30 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
+Validating for node ScaledLogLikelihood. 10 nodes to process in pass 2.
+
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> GlobalPrior = LearnableParameter -> [132, 1]
+Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
+
 Validating for node ScaledLogLikelihood, final verification.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -2167,7 +2515,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
-Validating for node Err. 1 nodes to process in pass 2.
+Validating for node Err. 9 nodes to process in pass 2.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -2240,6 +2588,29 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node Err. 9 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL2.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node Err, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -2323,7 +2694,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
-Validating for node cr. 3 nodes to process in pass 2.
+Validating for node cr. 12 nodes to process in pass 2.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -2411,6 +2782,34 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node cr. 12 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL3.W = LearnableParameter -> [512, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.b = LearnableParameter -> [512, 1]
+Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node cr, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -2472,6 +2871,35 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
+Validating for node ScaledLogLikelihood. 13 nodes to process in pass 2.
+
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL3.W = LearnableParameter -> [512, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.b = LearnableParameter -> [512, 1]
+Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> GlobalPrior = LearnableParameter -> [132, 1]
+Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
+
 Validating for node ScaledLogLikelihood, final verification.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -2534,6 +2962,35 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
+Validating for node ScaledLogLikelihood. 13 nodes to process in pass 2.
+
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL3.W = LearnableParameter -> [512, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.b = LearnableParameter -> [512, 1]
+Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> GlobalPrior = LearnableParameter -> [132, 1]
+Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
+
 Validating for node ScaledLogLikelihood, final verification.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -2595,6 +3052,34 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node Err. 12 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL3.W = LearnableParameter -> [512, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.b = LearnableParameter -> [512, 1]
+Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node Err, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -2655,6 +3140,34 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node Err. 12 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL3.W = LearnableParameter -> [512, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.b = LearnableParameter -> [512, 1]
+Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node Err, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -2694,7 +3207,7 @@ htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Spe
 ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
 label set 0: 129 classes
 minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
-Starting from checkpoint. Load Network From File C:\cygwin64\tmp\cntk-test-20151012183455.742480\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0.
+Starting from checkpoint. Load Network From File C:\cygwin64\tmp\cntk-test-20151024140953.757570\Speech\DNN_DiscriminativePreTraining@debug_gpu/models/cntkSpeech.0.
 
 
 Printing Gradient Computation Node Order ... 
@@ -2841,6 +3354,34 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node cr. 12 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL3.W = LearnableParameter -> [512, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.b = LearnableParameter -> [512, 1]
+Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node cr, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -2902,7 +3443,7 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
-Validating for node ScaledLogLikelihood. 2 nodes to process in pass 2.
+Validating for node ScaledLogLikelihood. 13 nodes to process in pass 2.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
 Validating --> HL3.W = LearnableParameter -> [512, 512]
@@ -2993,6 +3534,35 @@ Validating --> GlobalPrior = LearnableParameter -> [132, 1]
 Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
 Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
 
+Validating for node ScaledLogLikelihood. 13 nodes to process in pass 2.
+
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL3.W = LearnableParameter -> [512, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.b = LearnableParameter -> [512, 1]
+Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> GlobalPrior = LearnableParameter -> [132, 1]
+Validating --> logPrior = Log(GlobalPrior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(OL.z[132, MBSize 0], logPrior[132, 1]) -> [132, MBSize 0]
+
 Validating for node ScaledLogLikelihood, final verification.
 
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -3054,7 +3624,7 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
-Validating for node Err. 1 nodes to process in pass 2.
+Validating for node Err. 12 nodes to process in pass 2.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
 Validating --> OL.W = LearnableParameter -> [132, 512]
@@ -3142,6 +3712,34 @@ Validating --> OL.b = LearnableParameter -> [132, 1]
 Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
 Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
 
+Validating for node Err. 12 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> OL.W = LearnableParameter -> [132, 512]
+Validating --> HL3.W = LearnableParameter -> [512, 512]
+Validating --> HL2.W = LearnableParameter -> [512, 512]
+Validating --> HL1.W = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> GlobalMean = LearnableParameter -> [363, 1]
+Validating --> GlobalInvStd = LearnableParameter -> [363, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(features[363, MBSize 0], GlobalMean[363, 1], GlobalInvStd[363, 1]) -> [363, MBSize 0]
+Validating --> HL1.t = Times(HL1.W[512, 363], featNorm[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL1.b = LearnableParameter -> [512, 1]
+Validating --> HL1.z = Plus(HL1.t[512, MBSize 0], HL1.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL1.y = Sigmoid(HL1.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.t = Times(HL2.W[512, 512], HL1.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL2.b = LearnableParameter -> [512, 1]
+Validating --> HL2.z = Plus(HL2.t[512, MBSize 0], HL2.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL2.y = Sigmoid(HL2.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.t = Times(HL3.W[512, 512], HL2.y[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> HL3.b = LearnableParameter -> [512, 1]
+Validating --> HL3.z = Plus(HL3.t[512, MBSize 0], HL3.b[512, 1]) -> [512, MBSize 0]
+Validating --> HL3.y = Sigmoid(HL3.z[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> OL.t = Times(OL.W[132, 512], HL3.y[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> OL.b = LearnableParameter -> [132, 1]
+Validating --> OL.z = Plus(OL.t[132, MBSize 0], OL.b[132, 1]) -> [132, MBSize 0]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 0], OL.z[132, MBSize 0]) -> [1, 1]
+
 Validating for node Err, final verification.
 
 Validating --> labels = InputValue -> [132, MBSize 0]
@@ -3181,105 +3779,105 @@ minibatchiterator: epoch 0: frames [0..81920] (first utterance at frame 0), data
 requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
 
 Starting minibatch loop.
- Epoch[ 1 of 4]-Minibatch[   1-  10 of 320]: SamplesSeen = 2560; TrainLossPerSample =  4.10662956; EvalErr[0]PerSample = 0.82890625; TotalTime = 0.27483s; TotalTimePerSample = 0.10736ms; SamplesPerSecond = 9314
- Epoch[ 1 of 4]-Minibatch[  11-  20 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.55908470; EvalErr[0]PerSample = 0.63164062; TotalTime = 0.25240s; TotalTimePerSample = 0.09859ms; SamplesPerSecond = 10142
- Epoch[ 1 of 4]-Minibatch[  21-  30 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.03446350; EvalErr[0]PerSample = 0.53906250; TotalTime = 0.25247s; TotalTimePerSample = 0.09862ms; SamplesPerSecond = 10139
- Epoch[ 1 of 4]-Minibatch[  31-  40 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.73968811; EvalErr[0]PerSample = 0.47812500; TotalTime = 0.25275s; TotalTimePerSample = 0.09873ms; SamplesPerSecond = 10128
- Epoch[ 1 of 4]-Minibatch[  41-  50 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.54626236; EvalErr[0]PerSample = 0.43867187; TotalTime = 0.25343s; TotalTimePerSample = 0.09900ms; SamplesPerSecond = 10101
- Epoch[ 1 of 4]-Minibatch[  51-  60 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.44772797; EvalErr[0]PerSample = 0.41171875; TotalTime = 0.25254s; TotalTimePerSample = 0.09865ms; SamplesPerSecond = 10137
- Epoch[ 1 of 4]-Minibatch[  61-  70 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.36287384; EvalErr[0]PerSample = 0.40937500; TotalTime = 0.35783s; TotalTimePerSample = 0.13978ms; SamplesPerSecond = 7154
- Epoch[ 1 of 4]-Minibatch[  71-  80 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.36141815; EvalErr[0]PerSample = 0.39921875; TotalTime = 0.35367s; TotalTimePerSample = 0.13815ms; SamplesPerSecond = 7238
- Epoch[ 1 of 4]-Minibatch[  81-  90 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.34428864; EvalErr[0]PerSample = 0.38710937; TotalTime = 0.37068s; TotalTimePerSample = 0.14480ms; SamplesPerSecond = 6906
- Epoch[ 1 of 4]-Minibatch[  91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.30752716; EvalErr[0]PerSample = 0.38242188; TotalTime = 0.34724s; TotalTimePerSample = 0.13564ms; SamplesPerSecond = 7372
+ Epoch[ 1 of 4]-Minibatch[   1-  10 of 320]: SamplesSeen = 2560; TrainLossPerSample =  4.12455330; EvalErr[0]PerSample = 0.82734375; TotalTime = 0.37556s; TotalTimePerSample = 0.14670ms; SamplesPerSecond = 6816
+ Epoch[ 1 of 4]-Minibatch[  11-  20 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.55599785; EvalErr[0]PerSample = 0.63007813; TotalTime = 0.36775s; TotalTimePerSample = 0.14365ms; SamplesPerSecond = 6961
+ Epoch[ 1 of 4]-Minibatch[  21-  30 of 320]: SamplesSeen = 2560; TrainLossPerSample =  2.03516159; EvalErr[0]PerSample = 0.53945312; TotalTime = 0.38102s; TotalTimePerSample = 0.14884ms; SamplesPerSecond = 6718
+ Epoch[ 1 of 4]-Minibatch[  31-  40 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.73739853; EvalErr[0]PerSample = 0.47500000; TotalTime = 0.36620s; TotalTimePerSample = 0.14305ms; SamplesPerSecond = 6990
+ Epoch[ 1 of 4]-Minibatch[  41-  50 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.54207916; EvalErr[0]PerSample = 0.43515625; TotalTime = 0.34129s; TotalTimePerSample = 0.13332ms; SamplesPerSecond = 7500
+ Epoch[ 1 of 4]-Minibatch[  51-  60 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.44409790; EvalErr[0]PerSample = 0.41328125; TotalTime = 0.31795s; TotalTimePerSample = 0.12420ms; SamplesPerSecond = 8051
+ Epoch[ 1 of 4]-Minibatch[  61-  70 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.36059418; EvalErr[0]PerSample = 0.40898438; TotalTime = 0.34866s; TotalTimePerSample = 0.13620ms; SamplesPerSecond = 7342
+ Epoch[ 1 of 4]-Minibatch[  71-  80 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.35930023; EvalErr[0]PerSample = 0.40117188; TotalTime = 0.36478s; TotalTimePerSample = 0.14249ms; SamplesPerSecond = 7018
+ Epoch[ 1 of 4]-Minibatch[  81-  90 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.34254303; EvalErr[0]PerSample = 0.38632813; TotalTime = 0.35487s; TotalTimePerSample = 0.13862ms; SamplesPerSecond = 7213
+ Epoch[ 1 of 4]-Minibatch[  91- 100 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.30505676; EvalErr[0]PerSample = 0.38320312; TotalTime = 0.35420s; TotalTimePerSample = 0.13836ms; SamplesPerSecond = 7227
 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times.
- Epoch[ 1 of 4]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.30951538; EvalErr[0]PerSample = 0.38671875; TotalTime = 0.37789s; TotalTimePerSample = 0.14761ms; SamplesPerSecond = 6774
- Epoch[ 1 of 4]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23730469; EvalErr[0]PerSample = 0.36914063; TotalTime = 0.37117s; TotalTimePerSample = 0.14499ms; SamplesPerSecond = 6897
- Epoch[ 1 of 4]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.21422424; EvalErr[0]PerSample = 0.35625000; TotalTime = 0.37390s; TotalTimePerSample = 0.14606ms; SamplesPerSecond = 6846
- Epoch[ 1 of 4]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23798065; EvalErr[0]PerSample = 0.37421875; TotalTime = 0.34378s; TotalTimePerSample = 0.13429ms; SamplesPerSecond = 7446
- Epoch[ 1 of 4]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23455658; EvalErr[0]PerSample = 0.36914063; TotalTime = 0.32221s; TotalTimePerSample = 0.12586ms; SamplesPerSecond = 7945
- Epoch[ 1 of 4]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.19309692; EvalErr[0]PerSample = 0.34765625; TotalTime = 0.30479s; TotalTimePerSample = 0.11906ms; SamplesPerSecond = 8399
- Epoch[ 1 of 4]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.21521301; EvalErr[0]PerSample = 0.36679688; TotalTime = 0.29147s; TotalTimePerSample = 0.11386ms; SamplesPerSecond = 8782
- Epoch[ 1 of 4]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.24454651; EvalErr[0]PerSample = 0.37343750; TotalTime = 0.27830s; TotalTimePerSample = 0.10871ms; SamplesPerSecond = 9198
- Epoch[ 1 of 4]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.26795959; EvalErr[0]PerSample = 0.38750000; TotalTime = 0.26773s; TotalTimePerSample = 0.10458ms; SamplesPerSecond = 9561
- Epoch[ 1 of 4]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.22294617; EvalErr[0]PerSample = 0.38085938; TotalTime = 0.26019s; TotalTimePerSample = 0.10164ms; SamplesPerSecond = 9839
- Epoch[ 1 of 4]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18269348; EvalErr[0]PerSample = 0.35546875; TotalTime = 0.25254s; TotalTimePerSample = 0.09865ms; SamplesPerSecond = 10137
- Epoch[ 1 of 4]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.20206299; EvalErr[0]PerSample = 0.37148437; TotalTime = 0.25223s; TotalTimePerSample = 0.09853ms; SamplesPerSecond = 10149
- Epoch[ 1 of 4]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.22170105; EvalErr[0]PerSample = 0.36132813; TotalTime = 0.25269s; TotalTimePerSample = 0.09871ms; SamplesPerSecond = 10131
- Epoch[ 1 of 4]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18813477; EvalErr[0]PerSample = 0.35703125; TotalTime = 0.25354s; TotalTimePerSample = 0.09904ms; SamplesPerSecond = 10096
- Epoch[ 1 of 4]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.17123108; EvalErr[0]PerSample = 0.34843750; TotalTime = 0.31785s; TotalTimePerSample = 0.12416ms; SamplesPerSecond = 8054
- Epoch[ 1 of 4]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12467346; EvalErr[0]PerSample = 0.34843750; TotalTime = 0.36902s; TotalTimePerSample = 0.14415ms; SamplesPerSecond = 6937
- Epoch[ 1 of 4]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.19338379; EvalErr[0]PerSample = 0.36210938; TotalTime = 0.34072s; TotalTimePerSample = 0.13309ms; SamplesPerSecond = 7513
- Epoch[ 1 of 4]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.13827820; EvalErr[0]PerSample = 0.34648438; TotalTime = 0.34513s; TotalTimePerSample = 0.13482ms; SamplesPerSecond = 7417
- Epoch[ 1 of 4]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12540894; EvalErr[0]PerSample = 0.33710937; TotalTime = 0.37958s; TotalTimePerSample = 0.14827ms; SamplesPerSecond = 6744
- Epoch[ 1 of 4]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.10656738; EvalErr[0]PerSample = 0.33632812; TotalTime = 0.38014s; TotalTimePerSample = 0.14849ms; SamplesPerSecond = 6734
- Epoch[ 1 of 4]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.13371277; EvalErr[0]PerSample = 0.34218750; TotalTime = 0.34945s; TotalTimePerSample = 0.13650ms; SamplesPerSecond = 7325
- Epoch[ 1 of 4]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12843018; EvalErr[0]PerSample = 0.34882812; TotalTime = 0.33717s; TotalTimePerSample = 0.13171ms; SamplesPerSecond = 7592
-Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 1.4094871; EvalErrPerSample = 0.4010376; Ave LearnRatePerSample = 0.003125000047; EpochTime=13.636564
+ Epoch[ 1 of 4]-Minibatch[ 101- 110 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.30881348; EvalErr[0]PerSample = 0.38476563; TotalTime = 0.38684s; TotalTimePerSample = 0.15111ms; SamplesPerSecond = 6617
+ Epoch[ 1 of 4]-Minibatch[ 111- 120 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23755188; EvalErr[0]PerSample = 0.37304688; TotalTime = 0.38140s; TotalTimePerSample = 0.14899ms; SamplesPerSecond = 6712
+ Epoch[ 1 of 4]-Minibatch[ 121- 130 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.21070251; EvalErr[0]PerSample = 0.35546875; TotalTime = 0.37976s; TotalTimePerSample = 0.14834ms; SamplesPerSecond = 6741
+ Epoch[ 1 of 4]-Minibatch[ 131- 140 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.24008789; EvalErr[0]PerSample = 0.37109375; TotalTime = 0.35028s; TotalTimePerSample = 0.13683ms; SamplesPerSecond = 7308
+ Epoch[ 1 of 4]-Minibatch[ 141- 150 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.23422089; EvalErr[0]PerSample = 0.36835937; TotalTime = 0.32867s; TotalTimePerSample = 0.12839ms; SamplesPerSecond = 7789
+ Epoch[ 1 of 4]-Minibatch[ 151- 160 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.19425964; EvalErr[0]PerSample = 0.35195312; TotalTime = 0.36256s; TotalTimePerSample = 0.14162ms; SamplesPerSecond = 7060
+ Epoch[ 1 of 4]-Minibatch[ 161- 170 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.21415710; EvalErr[0]PerSample = 0.36289063; TotalTime = 0.36688s; TotalTimePerSample = 0.14331ms; SamplesPerSecond = 6977
+ Epoch[ 1 of 4]-Minibatch[ 171- 180 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.24289856; EvalErr[0]PerSample = 0.37031250; TotalTime = 0.36730s; TotalTimePerSample = 0.14348ms; SamplesPerSecond = 6969
+ Epoch[ 1 of 4]-Minibatch[ 181- 190 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.26465454; EvalErr[0]PerSample = 0.38359375; TotalTime = 0.36054s; TotalTimePerSample = 0.14083ms; SamplesPerSecond = 7100
+ Epoch[ 1 of 4]-Minibatch[ 191- 200 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.22050476; EvalErr[0]PerSample = 0.38085938; TotalTime = 0.33458s; TotalTimePerSample = 0.13069ms; SamplesPerSecond = 7651
+ Epoch[ 1 of 4]-Minibatch[ 201- 210 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.17745056; EvalErr[0]PerSample = 0.35507813; TotalTime = 0.34743s; TotalTimePerSample = 0.13571ms; SamplesPerSecond = 7368
+ Epoch[ 1 of 4]-Minibatch[ 211- 220 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.19851379; EvalErr[0]PerSample = 0.37109375; TotalTime = 0.36452s; TotalTimePerSample = 0.14239ms; SamplesPerSecond = 7022
+ Epoch[ 1 of 4]-Minibatch[ 221- 230 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.21453857; EvalErr[0]PerSample = 0.35820313; TotalTime = 0.34422s; TotalTimePerSample = 0.13446ms; SamplesPerSecond = 7437
+ Epoch[ 1 of 4]-Minibatch[ 231- 240 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18011475; EvalErr[0]PerSample = 0.35546875; TotalTime = 0.31584s; TotalTimePerSample = 0.12337ms; SamplesPerSecond = 8105
+ Epoch[ 1 of 4]-Minibatch[ 241- 250 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.16693726; EvalErr[0]PerSample = 0.35195312; TotalTime = 0.29809s; TotalTimePerSample = 0.11644ms; SamplesPerSecond = 8588
+ Epoch[ 1 of 4]-Minibatch[ 251- 260 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12398987; EvalErr[0]PerSample = 0.35234375; TotalTime = 0.28314s; TotalTimePerSample = 0.11060ms; SamplesPerSecond = 9041
+ Epoch[ 1 of 4]-Minibatch[ 261- 270 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.18822021; EvalErr[0]PerSample = 0.36328125; TotalTime = 0.27412s; TotalTimePerSample = 0.10708ms; SamplesPerSecond = 9339
+ Epoch[ 1 of 4]-Minibatch[ 271- 280 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.13831482; EvalErr[0]PerSample = 0.35078125; TotalTime = 0.28390s; TotalTimePerSample = 0.11090ms; SamplesPerSecond = 9017
+ Epoch[ 1 of 4]-Minibatch[ 281- 290 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12718811; EvalErr[0]PerSample = 0.33984375; TotalTime = 0.38089s; TotalTimePerSample = 0.14878ms; SamplesPerSecond = 6721
+ Epoch[ 1 of 4]-Minibatch[ 291- 300 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.11155701; EvalErr[0]PerSample = 0.34179688; TotalTime = 0.38982s; TotalTimePerSample = 0.15227ms; SamplesPerSecond = 6567
+ Epoch[ 1 of 4]-Minibatch[ 301- 310 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.13423157; EvalErr[0]PerSample = 0.34101562; TotalTime = 0.38594s; TotalTimePerSample = 0.15076ms; SamplesPerSecond = 6633
+ Epoch[ 1 of 4]-Minibatch[ 311- 320 of 320]: SamplesSeen = 2560; TrainLossPerSample =  1.12716675; EvalErr[0]PerSample = 0.34414062; TotalTime = 0.33494s; TotalTimePerSample = 0.13084ms; SamplesPerSecond = 7643
+Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 1.4082143; EvalErrPerSample = 0.4008545; Ave LearnRatePerSample = 0.003125000047; EpochTime=16.150435
 Starting Epoch 2: learning rate per sample = 0.003125  effective momentum = 0.810210 
 minibatchiterator: epoch 1: frames [81920..163840] (first utterance at frame 81920), data subset 0 of 1, with 1 datapasses
 
 Starting minibatch loop.
- Epoch[ 2 of 4]-Minibatch[   1-  10 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.20010586; EvalErr[0]PerSample = 0.36894531; TotalTime = 0.62208s; TotalTimePerSample = 0.12150ms; SamplesPerSecond = 8230
- Epoch[ 2 of 4]-Minibatch[  11-  20 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.15503139; EvalErr[0]PerSample = 0.34570313; TotalTime = 0.61113s; TotalTimePerSample = 0.11936ms; SamplesPerSecond = 8377
- Epoch[ 2 of 4]-Minibatch[  21-  30 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.09869881; EvalErr[0]PerSample = 0.33535156; TotalTime = 0.56470s; TotalTimePerSample = 0.11029ms; SamplesPerSecond = 9066
- Epoch[ 2 of 4]-Minibatch[  31-  40 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.09688034; EvalErr[0]PerSample = 0.33593750; TotalTime = 0.51043s; TotalTimePerSample = 0.09969ms; SamplesPerSecond = 10030
- Epoch[ 2 of 4]-Minibatch[  41-  50 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.25485992; EvalErr[0]PerSample = 0.37636719; TotalTime = 0.47167s; TotalTimePerSample = 0.09212ms; SamplesPerSecond = 10855
- Epoch[ 2 of 4]-Minibatch[  51-  60 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.15979233; EvalErr[0]PerSample = 0.36191406; TotalTime = 0.44445s; TotalTimePerSample = 0.08681ms; SamplesPerSecond = 11519
- Epoch[ 2 of 4]-Minibatch[  61-  70 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.14767456; EvalErr[0]PerSample = 0.34414062; TotalTime = 0.42450s; TotalTimePerSample = 0.08291ms; SamplesPerSecond = 12061
- Epoch[ 2 of 4]-Minibatch[  71-  80 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.08805161; EvalErr[0]PerSample = 0.33222656; TotalTime = 0.62083s; TotalTimePerSample = 0.12126ms; SamplesPerSecond = 8246
- Epoch[ 2 of 4]-Minibatch[  81-  90 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.09917145; EvalErr[0]PerSample = 0.33476563; TotalTime = 0.59906s; TotalTimePerSample = 0.11700ms; SamplesPerSecond = 8546
- Epoch[ 2 of 4]-Minibatch[  91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.06098633; EvalErr[0]PerSample = 0.32734375; TotalTime = 0.53178s; TotalTimePerSample = 0.10386ms; SamplesPerSecond = 9628
+ Epoch[ 2 of 4]-Minibatch[   1-  10 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.20089607; EvalErr[0]PerSample = 0.36757812; TotalTime = 0.63575s; TotalTimePerSample = 0.12417ms; SamplesPerSecond = 8053
+ Epoch[ 2 of 4]-Minibatch[  11-  20 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.15295639; EvalErr[0]PerSample = 0.34550781; TotalTime = 0.57244s; TotalTimePerSample = 0.11180ms; SamplesPerSecond = 8944
+ Epoch[ 2 of 4]-Minibatch[  21-  30 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.09945831; EvalErr[0]PerSample = 0.33613281; TotalTime = 0.55825s; TotalTimePerSample = 0.10903ms; SamplesPerSecond = 9171
+ Epoch[ 2 of 4]-Minibatch[  31-  40 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.09916496; EvalErr[0]PerSample = 0.33867188; TotalTime = 0.61763s; TotalTimePerSample = 0.12063ms; SamplesPerSecond = 8289
+ Epoch[ 2 of 4]-Minibatch[  41-  50 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.17260475; EvalErr[0]PerSample = 0.36230469; TotalTime = 0.57866s; TotalTimePerSample = 0.11302ms; SamplesPerSecond = 8847
+ Epoch[ 2 of 4]-Minibatch[  51-  60 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.15717964; EvalErr[0]PerSample = 0.35820313; TotalTime = 0.61745s; TotalTimePerSample = 0.12060ms; SamplesPerSecond = 8292
+ Epoch[ 2 of 4]-Minibatch[  61-  70 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.14431229; EvalErr[0]PerSample = 0.34296875; TotalTime = 0.59477s; TotalTimePerSample = 0.11617ms; SamplesPerSecond = 8608
+ Epoch[ 2 of 4]-Minibatch[  71-  80 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.10515747; EvalErr[0]PerSample = 0.34394531; TotalTime = 0.58508s; TotalTimePerSample = 0.11427ms; SamplesPerSecond = 8750
+ Epoch[ 2 of 4]-Minibatch[  81-  90 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.15175400; EvalErr[0]PerSample = 0.35449219; TotalTime = 0.59203s; TotalTimePerSample = 0.11563ms; SamplesPerSecond = 8648
+ Epoch[ 2 of 4]-Minibatch[  91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.11654053; EvalErr[0]PerSample = 0.34101562; TotalTime = 0.57091s; TotalTimePerSample = 0.11151ms; SamplesPerSecond = 8968
 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times.
- Epoch[ 2 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.10802689; EvalErr[0]PerSample = 0.33925781; TotalTime = 0.48630s; TotalTimePerSample = 0.09498ms; SamplesPerSecond = 10528
- Epoch[ 2 of 4]-Minibatch[ 111- 120 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.14810791; EvalErr[0]PerSample = 0.35449219; TotalTime = 0.45579s; TotalTimePerSample = 0.08902ms; SamplesPerSecond = 11233
- Epoch[ 2 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.05741577; EvalErr[0]PerSample = 0.32734375; TotalTime = 0.42856s; TotalTimePerSample = 0.08370ms; SamplesPerSecond = 11946
- Epoch[ 2 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.02627869; EvalErr[0]PerSample = 0.32187500; TotalTime = 0.44359s; TotalTimePerSample = 0.08664ms; SamplesPerSecond = 11542
- Epoch[ 2 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.07954559; EvalErr[0]PerSample = 0.32402344; TotalTime = 0.43958s; TotalTimePerSample = 0.08586ms; SamplesPerSecond = 11647
- Epoch[ 2 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.06135712; EvalErr[0]PerSample = 0.32148437; TotalTime = 0.38011s; TotalTimePerSample = 0.07424ms; SamplesPerSecond = 13469
-Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 1.1151241; EvalErrPerSample = 0.34069824; Ave LearnRatePerSample = 0.003125000047; EpochTime=8.123637
+ Epoch[ 2 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.11851807; EvalErr[0]PerSample = 0.34472656; TotalTime = 0.58517s; TotalTimePerSample = 0.11429ms; SamplesPerSecond = 8749
+ Epoch[ 2 of 4]-Minibatch[ 111- 120 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.11374054; EvalErr[0]PerSample = 0.34492187; TotalTime = 0.58136s; TotalTimePerSample = 0.11355ms; SamplesPerSecond = 8806
+ Epoch[ 2 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.04686737; EvalErr[0]PerSample = 0.32265625; TotalTime = 0.51156s; TotalTimePerSample = 0.09991ms; SamplesPerSecond = 10008
+ Epoch[ 2 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.02721252; EvalErr[0]PerSample = 0.32246094; TotalTime = 0.61121s; TotalTimePerSample = 0.11938ms; SamplesPerSecond = 8376
+ Epoch[ 2 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.08386230; EvalErr[0]PerSample = 0.33144531; TotalTime = 0.58963s; TotalTimePerSample = 0.11516ms; SamplesPerSecond = 8683
+ Epoch[ 2 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.06164856; EvalErr[0]PerSample = 0.32558594; TotalTime = 0.52979s; TotalTimePerSample = 0.10347ms; SamplesPerSecond = 9664
+Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 1.1157421; EvalErrPerSample = 0.34266359; Ave LearnRatePerSample = 0.003125000047; EpochTime=9.420168
 Starting Epoch 3: learning rate per sample = 0.003125  effective momentum = 0.810210 
 minibatchiterator: epoch 2: frames [163840..245760] (first utterance at frame 163840), data subset 0 of 1, with 1 datapasses
 
 Starting minibatch loop.
- Epoch[ 3 of 4]-Minibatch[   1-  10 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.12565956; EvalErr[0]PerSample = 0.34511719; TotalTime = 0.58787s; TotalTimePerSample = 0.11482ms; SamplesPerSecond = 8709
- Epoch[ 3 of 4]-Minibatch[  11-  20 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.08568897; EvalErr[0]PerSample = 0.33847656; TotalTime = 0.52737s; TotalTimePerSample = 0.10300ms; SamplesPerSecond = 9708
- Epoch[ 3 of 4]-Minibatch[  21-  30 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.08227139; EvalErr[0]PerSample = 0.33398438; TotalTime = 0.48594s; TotalTimePerSample = 0.09491ms; SamplesPerSecond = 10536
- Epoch[ 3 of 4]-Minibatch[  31-  40 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.09552345; EvalErr[0]PerSample = 0.33769531; TotalTime = 0.45227s; TotalTimePerSample = 0.08833ms; SamplesPerSecond = 11320
- Epoch[ 3 of 4]-Minibatch[  41-  50 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.07601204; EvalErr[0]PerSample = 0.33691406; TotalTime = 0.42756s; TotalTimePerSample = 0.08351ms; SamplesPerSecond = 11975
- Epoch[ 3 of 4]-Minibatch[  51-  60 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.05121803; EvalErr[0]PerSample = 0.33046875; TotalTime = 0.42206s; TotalTimePerSample = 0.08243ms; SamplesPerSecond = 12131
- Epoch[ 3 of 4]-Minibatch[  61-  70 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.09072342; EvalErr[0]PerSample = 0.33359375; TotalTime = 0.41994s; TotalTimePerSample = 0.08202ms; SamplesPerSecond = 12192
- Epoch[ 3 of 4]-Minibatch[  71-  80 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.07286148; EvalErr[0]PerSample = 0.32265625; TotalTime = 0.42157s; TotalTimePerSample = 0.08234ms; SamplesPerSecond = 12145
- Epoch[ 3 of 4]-Minibatch[  81-  90 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.04907379; EvalErr[0]PerSample = 0.32558594; TotalTime = 0.42022s; TotalTimePerSample = 0.08207ms; SamplesPerSecond = 12184
- Epoch[ 3 of 4]-Minibatch[  91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.05518036; EvalErr[0]PerSample = 0.32714844; TotalTime = 0.42096s; TotalTimePerSample = 0.08222ms; SamplesPerSecond = 12162
+ Epoch[ 3 of 4]-Minibatch[   1-  10 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.12331724; EvalErr[0]PerSample = 0.34121094; TotalTime = 0.60252s; TotalTimePerSample = 0.11768ms; SamplesPerSecond = 8497
+ Epoch[ 3 of 4]-Minibatch[  11-  20 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.07871103; EvalErr[0]PerSample = 0.33652344; TotalTime = 0.61255s; TotalTimePerSample = 0.11964ms; SamplesPerSecond = 8358
+ Epoch[ 3 of 4]-Minibatch[  21-  30 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.06784973; EvalErr[0]PerSample = 0.33183594; TotalTime = 0.56505s; TotalTimePerSample = 0.11036ms; SamplesPerSecond = 9061
+ Epoch[ 3 of 4]-Minibatch[  31-  40 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.08440666; EvalErr[0]PerSample = 0.33398438; TotalTime = 0.55108s; TotalTimePerSample = 0.10763ms; SamplesPerSecond = 9290
+ Epoch[ 3 of 4]-Minibatch[  41-  50 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.07466774; EvalErr[0]PerSample = 0.33320312; TotalTime = 0.58281s; TotalTimePerSample = 0.11383ms; SamplesPerSecond = 8785
+ Epoch[ 3 of 4]-Minibatch[  51-  60 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.05427513; EvalErr[0]PerSample = 0.33125000; TotalTime = 0.59333s; TotalTimePerSample = 0.11589ms; SamplesPerSecond = 8629
+ Epoch[ 3 of 4]-Minibatch[  61-  70 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.06873093; EvalErr[0]PerSample = 0.32773438; TotalTime = 0.60744s; TotalTimePerSample = 0.11864ms; SamplesPerSecond = 8428
+ Epoch[ 3 of 4]-Minibatch[  71-  80 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.08097610; EvalErr[0]PerSample = 0.33007813; TotalTime = 0.53753s; TotalTimePerSample = 0.10499ms; SamplesPerSecond = 9525
+ Epoch[ 3 of 4]-Minibatch[  81-  90 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.05431290; EvalErr[0]PerSample = 0.32792969; TotalTime = 0.48923s; TotalTimePerSample = 0.09555ms; SamplesPerSecond = 10465
+ Epoch[ 3 of 4]-Minibatch[  91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.06173096; EvalErr[0]PerSample = 0.32695313; TotalTime = 0.45004s; TotalTimePerSample = 0.08790ms; SamplesPerSecond = 11376
 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times.
- Epoch[ 3 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.03641891; EvalErr[0]PerSample = 0.32148437; TotalTime = 0.42161s; TotalTimePerSample = 0.08235ms; SamplesPerSecond = 12143
- Epoch[ 3 of 4]-Minibatch[ 111- 120 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.08370361; EvalErr[0]PerSample = 0.33710937; TotalTime = 0.42014s; TotalTimePerSample = 0.08206ms; SamplesPerSecond = 12186
- Epoch[ 3 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.10984344; EvalErr[0]PerSample = 0.33164063; TotalTime = 0.42099s; TotalTimePerSample = 0.08222ms; SamplesPerSecond = 12161
- Epoch[ 3 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.06258087; EvalErr[0]PerSample = 0.32714844; TotalTime = 0.42017s; TotalTimePerSample = 0.08206ms; SamplesPerSecond = 12185
- Epoch[ 3 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.06584320; EvalErr[0]PerSample = 0.33671875; TotalTime = 0.42112s; TotalTimePerSample = 0.08225ms; SamplesPerSecond = 12158
- Epoch[ 3 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.05016174; EvalErr[0]PerSample = 0.33183594; TotalTime = 0.41651s; TotalTimePerSample = 0.08135ms; SamplesPerSecond = 12292
-Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 1.0745478; EvalErrPerSample = 0.33234864; Ave LearnRatePerSample = 0.003125000047; EpochTime=7.18526
+ Epoch[ 3 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.04505692; EvalErr[0]PerSample = 0.32792969; TotalTime = 0.42087s; TotalTimePerSample = 0.08220ms; SamplesPerSecond = 12165
+ Epoch[ 3 of 4]-Minibatch[ 111- 120 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.08151245; EvalErr[0]PerSample = 0.33574219; TotalTime = 0.51025s; TotalTimePerSample = 0.09966ms; SamplesPerSecond = 10034
+ Epoch[ 3 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.10628204; EvalErr[0]PerSample = 0.33437500; TotalTime = 0.61115s; TotalTimePerSample = 0.11936ms; SamplesPerSecond = 8377
+ Epoch[ 3 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.05827026; EvalErr[0]PerSample = 0.32636719; TotalTime = 0.61738s; TotalTimePerSample = 0.12058ms; SamplesPerSecond = 8293
+ Epoch[ 3 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.05841064; EvalErr[0]PerSample = 0.33574219; TotalTime = 0.62912s; TotalTimePerSample = 0.12287ms; SamplesPerSecond = 8138
+ Epoch[ 3 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.04437714; EvalErr[0]PerSample = 0.32773438; TotalTime = 0.53966s; TotalTimePerSample = 0.10540ms; SamplesPerSecond = 9487
+Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 1.0714306; EvalErrPerSample = 0.33178711; Ave LearnRatePerSample = 0.003125000047; EpochTime=9.000243
 Starting Epoch 4: learning rate per sample = 0.003125  effective momentum = 0.810210 
 minibatchiterator: epoch 3: frames [245760..327680] (first utterance at frame 245760), data subset 0 of 1, with 1 datapasses
 
 Starting minibatch loop.
- Epoch[ 4 of 4]-Minibatch[   1-  10 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.04058170; EvalErr[0]PerSample = 0.32382813; TotalTime = 0.61361s; TotalTimePerSample = 0.11985ms; SamplesPerSecond = 8344
- Epoch[ 4 of 4]-Minibatch[  11-  20 of 160]: SamplesSeen = 4926; TrainLossPerSample =  1.03329491; EvalErr[0]PerSample = 0.31465692; TotalTime = 1.68883s; TotalTimePerSample = 0.34284ms; SamplesPerSecond = 2916
- Epoch[ 4 of 4]-Minibatch[  21-  30 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.01529274; EvalErr[0]PerSample = 0.31835938; TotalTime = 0.45719s; TotalTimePerSample = 0.08929ms; SamplesPerSecond = 11198
- Epoch[ 4 of 4]-Minibatch[  31-  40 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.00803413; EvalErr[0]PerSample = 0.31679687; TotalTime = 0.42910s; TotalTimePerSample = 0.08381ms; SamplesPerSecond = 11932
- Epoch[ 4 of 4]-Minibatch[  41-  50 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.01046181; EvalErr[0]PerSample = 0.31894531; TotalTime = 0.42059s; TotalTimePerSample = 0.08215ms; SamplesPerSecond = 12173
- Epoch[ 4 of 4]-Minibatch[  51-  60 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.99893723; EvalErr[0]PerSample = 0.31367187; TotalTime = 0.42160s; TotalTimePerSample = 0.08234ms; SamplesPerSecond = 12144
- Epoch[ 4 of 4]-Minibatch[  61-  70 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.99259148; EvalErr[0]PerSample = 0.30644531; TotalTime = 0.42045s; TotalTimePerSample = 0.08212ms; SamplesPerSecond = 12177
- Epoch[ 4 of 4]-Minibatch[  71-  80 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.01776657; EvalErr[0]PerSample = 0.31914063; TotalTime = 0.42189s; TotalTimePerSample = 0.08240ms; SamplesPerSecond = 12135
- Epoch[ 4 of 4]-Minibatch[  81-  90 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.99872665; EvalErr[0]PerSample = 0.31503906; TotalTime = 0.42067s; TotalTimePerSample = 0.08216ms; SamplesPerSecond = 12171
- Epoch[ 4 of 4]-Minibatch[  91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.97249756; EvalErr[0]PerSample = 0.31191406; TotalTime = 0.42106s; TotalTimePerSample = 0.08224ms; SamplesPerSecond = 12159
+ Epoch[ 4 of 4]-Minibatch[   1-  10 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.04450397; EvalErr[0]PerSample = 0.33125000; TotalTime = 0.60059s; TotalTimePerSample = 0.11730ms; SamplesPerSecond = 8524
+ Epoch[ 4 of 4]-Minibatch[  11-  20 of 160]: SamplesSeen = 4926; TrainLossPerSample =  1.02895867; EvalErr[0]PerSample = 0.31567194; TotalTime = 1.93158s; TotalTimePerSample = 0.39212ms; SamplesPerSecond = 2550
+ Epoch[ 4 of 4]-Minibatch[  21-  30 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.00198059; EvalErr[0]PerSample = 0.31601563; TotalTime = 0.56293s; TotalTimePerSample = 0.10995ms; SamplesPerSecond = 9095
+ Epoch[ 4 of 4]-Minibatch[  31-  40 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.00561543; EvalErr[0]PerSample = 0.31777344; TotalTime = 0.59339s; TotalTimePerSample = 0.11590ms; SamplesPerSecond = 8628
+ Epoch[ 4 of 4]-Minibatch[  41-  50 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.00148926; EvalErr[0]PerSample = 0.31601563; TotalTime = 0.61272s; TotalTimePerSample = 0.11967ms; SamplesPerSecond = 8356
+ Epoch[ 4 of 4]-Minibatch[  51-  60 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.00593376; EvalErr[0]PerSample = 0.31406250; TotalTime = 0.53038s; TotalTimePerSample = 0.10359ms; SamplesPerSecond = 9653
+ Epoch[ 4 of 4]-Minibatch[  61-  70 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.98752327; EvalErr[0]PerSample = 0.30722656; TotalTime = 0.48194s; TotalTimePerSample = 0.09413ms; SamplesPerSecond = 10623
+ Epoch[ 4 of 4]-Minibatch[  71-  80 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.01428757; EvalErr[0]PerSample = 0.31992188; TotalTime = 0.44727s; TotalTimePerSample = 0.08736ms; SamplesPerSecond = 11447
+ Epoch[ 4 of 4]-Minibatch[  81-  90 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.99691544; EvalErr[0]PerSample = 0.31621094; TotalTime = 0.48183s; TotalTimePerSample = 0.09411ms; SamplesPerSecond = 10626
+ Epoch[ 4 of 4]-Minibatch[  91- 100 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.96604996; EvalErr[0]PerSample = 0.30937500; TotalTime = 0.59759s; TotalTimePerSample = 0.11672ms; SamplesPerSecond = 8567
 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times.
- Epoch[ 4 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.99847946; EvalErr[0]PerSample = 0.30937500; TotalTime = 0.42112s; TotalTimePerSample = 0.08225ms; SamplesPerSecond = 12157
- Epoch[ 4 of 4]-Minibatch[ 111- 120 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.99825592; EvalErr[0]PerSample = 0.30859375; TotalTime = 0.42117s; TotalTimePerSample = 0.08226ms; SamplesPerSecond = 12156
- Epoch[ 4 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.01118851; EvalErr[0]PerSample = 0.31523438; TotalTime = 0.42075s; TotalTimePerSample = 0.08218ms; SamplesPerSecond = 12168
- Epoch[ 4 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.99189148; EvalErr[0]PerSample = 0.31132813; TotalTime = 0.42094s; TotalTimePerSample = 0.08222ms; SamplesPerSecond = 12163
- Epoch[ 4 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.95366974; EvalErr[0]PerSample = 0.30312500; TotalTime = 0.42195s; TotalTimePerSample = 0.08241ms; SamplesPerSecond = 12134
- Epoch[ 4 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.99163055; EvalErr[0]PerSample = 0.31074219; TotalTime = 0.39088s; TotalTimePerSample = 0.07634ms; SamplesPerSecond = 13098
-Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 1.0018568; EvalErrPerSample = 0.31358644; Ave LearnRatePerSample = 0.003125000047; EpochTime=8.314413
+ Epoch[ 4 of 4]-Minibatch[ 101- 110 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.99062958; EvalErr[0]PerSample = 0.30527344; TotalTime = 0.58136s; TotalTimePerSample = 0.11355ms; SamplesPerSecond = 8806
+ Epoch[ 4 of 4]-Minibatch[ 111- 120 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.99886856; EvalErr[0]PerSample = 0.30976562; TotalTime = 0.57362s; TotalTimePerSample = 0.11203ms; SamplesPerSecond = 8925
+ Epoch[ 4 of 4]-Minibatch[ 121- 130 of 160]: SamplesSeen = 5120; TrainLossPerSample =  1.00958328; EvalErr[0]PerSample = 0.31523438; TotalTime = 0.60384s; TotalTimePerSample = 0.11794ms; SamplesPerSecond = 8479
+ Epoch[ 4 of 4]-Minibatch[ 131- 140 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.97942047; EvalErr[0]PerSample = 0.31171875; TotalTime = 0.60621s; TotalTimePerSample = 0.11840ms; SamplesPerSecond = 8445
+ Epoch[ 4 of 4]-Minibatch[ 141- 150 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.94226837; EvalErr[0]PerSample = 0.30136719; TotalTime = 0.60218s; TotalTimePerSample = 0.11761ms; SamplesPerSecond = 8502
+ Epoch[ 4 of 4]-Minibatch[ 151- 160 of 160]: SamplesSeen = 5120; TrainLossPerSample =  0.96711578; EvalErr[0]PerSample = 0.30175781; TotalTime = 0.49045s; TotalTimePerSample = 0.09579ms; SamplesPerSecond = 10439
+Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 0.99611807; EvalErrPerSample = 0.31303713; Ave LearnRatePerSample = 0.003125000047; EpochTime=10.396508
 CNTKCommandTrainEnd: speechTrain
 COMPLETED
diff --git a/Tests/Speech/DNN/DiscriminativePreTraining/testcases.yml b/Tests/Speech/DNN/DiscriminativePreTraining/testcases.yml
index b7b5d9b27..b48f31b15 100644
--- a/Tests/Speech/DNN/DiscriminativePreTraining/testcases.yml
+++ b/Tests/Speech/DNN/DiscriminativePreTraining/testcases.yml
@@ -3,11 +3,7 @@ tags:
      # running on every BVT job in 'S' (Speech) leg in Debug-GPU and Release-CPU configurations:
      - bvt-s  (flavor=='debug') ^ (device=='cpu')
      # running unconditionally on every Nightly job in 'S' leg
-     # TODO: Temporary disabling Release-GPU because of a known bug causing large variance between
-     # Release and Debug configurations for GPU only for this (Speech/DNN/DiscriminativePreTraining) test. 
-     # This will be re-enabled after the bug has been addressed. 
-     # DO NOT COPY this disablement for other tests!!
-     - nightly-s  (flavor!='release') or (device!='gpu')
+     - nightly-s
 
 testCases:
   CNTK Run must be completed:
diff --git a/Tests/Speech/DNN/ParallelNoQuantization/baseline.gpu.txt b/Tests/Speech/DNN/ParallelNoQuantization/baseline.gpu.txt
index d77e6b08c..4bafb9779 100644
--- a/Tests/Speech/DNN/ParallelNoQuantization/baseline.gpu.txt
+++ b/Tests/Speech/DNN/ParallelNoQuantization/baseline.gpu.txt
@@ -1,40 +1,37 @@
-=== Running mpiexec -n 3 /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/../cntk.config RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data DeviceId=0 stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr
+=== Running mpiexec -n 3 /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/../cntk.config RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/.. DeviceId=0 stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr
 MPIWrapper: initializing MPI
 MPIWrapper: initializing MPI
 MPIWrapper: initializing MPI
 ping [requestnodes (before change)]: 3 nodes pinging each other
-ping [requestnodes (before change)]: 3 nodes pinging each other
-ping [requestnodes (before change)]: 3 nodes pinging each other
-ping [requestnodes (before change)]: all 3 nodes responded
-requestnodes [MPIWrapper]: using 3 out of 3 MPI nodes (3 requested); we (1) are in (participating)
-ping [requestnodes (after change)]: 3 nodes pinging each other
-ping [requestnodes (before change)]: all 3 nodes responded
-requestnodes [MPIWrapper]: using 3 out of 3 MPI nodes (3 requested); we (0) are in (participating)
-ping [requestnodes (after change)]: 3 nodes pinging each other
 ping [requestnodes (before change)]: all 3 nodes responded
 requestnodes [MPIWrapper]: using 3 out of 3 MPI nodes (3 requested); we (2) are in (participating)
 ping [requestnodes (after change)]: 3 nodes pinging each other
 ping [requestnodes (after change)]: all 3 nodes responded
-mpihelper: we are cog 0 in a gearbox of 3
-ping [mpihelper]: 3 nodes pinging each other
-ping [requestnodes (after change)]: all 3 nodes responded
-mpihelper: we are cog 1 in a gearbox of 3
-ping [mpihelper]: 3 nodes pinging each other
-ping [requestnodes (after change)]: all 3 nodes responded
 mpihelper: we are cog 2 in a gearbox of 3
 ping [mpihelper]: 3 nodes pinging each other
 ping [mpihelper]: all 3 nodes responded
+ping [requestnodes (before change)]: 3 nodes pinging each other
+ping [requestnodes (before change)]: all 3 nodes responded
+requestnodes [MPIWrapper]: using 3 out of 3 MPI nodes (3 requested); we (0) are in (participating)
+ping [requestnodes (after change)]: 3 nodes pinging each other
+ping [requestnodes (after change)]: all 3 nodes responded
+mpihelper: we are cog 0 in a gearbox of 3
+ping [mpihelper]: 3 nodes pinging each other
 ping [mpihelper]: all 3 nodes responded
+ping [requestnodes (before change)]: 3 nodes pinging each other
+ping [requestnodes (before change)]: all 3 nodes responded
+requestnodes [MPIWrapper]: using 3 out of 3 MPI nodes (3 requested); we (1) are in (participating)
+ping [requestnodes (after change)]: 3 nodes pinging each other
+ping [requestnodes (after change)]: all 3 nodes responded
+mpihelper: we are cog 1 in a gearbox of 3
+ping [mpihelper]: 3 nodes pinging each other
 ping [mpihelper]: all 3 nodes responded
-Redirecting stderr to file /tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank0
-CNTKModelPath: /tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
-Redirecting stderr to file /tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank1
-CNTKModelPath: /tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
-Redirecting stderr to file /tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank2
-CNTKModelPath: /tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
-MPI Rank 0: running on localhost at 2015/10/02 13:38:52
-MPI Rank 0: command line options: 
-MPI Rank 0: configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/../cntk.config RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data DeviceId=0 stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr 
+Redirecting stderr to file /tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank0
+Redirecting stderr to file /tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank1
+Redirecting stderr to file /tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank2
+MPI Rank 0: running on localhost at 2015/10/24 12:56:11
+MPI Rank 0: command line: 
+MPI Rank 0: /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/../cntk.config RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/.. DeviceId=0 stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr 
 MPI Rank 0: 
 MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
 MPI Rank 0: precision=float
@@ -126,10 +123,11 @@ MPI Rank 0:           labelType=Category
 MPI Rank 0:       ]
 MPI Rank 0:     ]
 MPI Rank 0: ]
-MPI Rank 0: RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu
+MPI Rank 0: RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu
 MPI Rank 0: DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
+MPI Rank 0: ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/..
 MPI Rank 0: DeviceId=0
-MPI Rank 0: stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr
+MPI Rank 0: stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr
 MPI Rank 0: 
 MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
 MPI Rank 0: 
@@ -140,7 +138,7 @@ MPI Rank 0: deviceId=0
 MPI Rank 0: parallelTrain=true
 MPI Rank 0: speechTrain=[
 MPI Rank 0:     action=train
-MPI Rank 0:     modelPath=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
+MPI Rank 0:     modelPath=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
 MPI Rank 0:     deviceId=0
 MPI Rank 0:     traceLevel=1
 MPI Rank 0:     SimpleNetworkBuilder=[
@@ -223,23 +221,25 @@ MPI Rank 0:           labelType=Category
 MPI Rank 0:       ]
 MPI Rank 0:     ]
 MPI Rank 0: ]
-MPI Rank 0: RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu
+MPI Rank 0: RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu
 MPI Rank 0: DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
+MPI Rank 0: ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/..
 MPI Rank 0: DeviceId=0
-MPI Rank 0: stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr
+MPI Rank 0: stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr
 MPI Rank 0: 
 MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 MPI Rank 0: 
 MPI Rank 0: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 MPI Rank 0: configparameters: cntk.config:command=speechTrain
+MPI Rank 0: configparameters: cntk.config:ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/..
 MPI Rank 0: configparameters: cntk.config:DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
 MPI Rank 0: configparameters: cntk.config:deviceId=0
 MPI Rank 0: configparameters: cntk.config:parallelTrain=true
 MPI Rank 0: configparameters: cntk.config:precision=float
-MPI Rank 0: configparameters: cntk.config:RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu
+MPI Rank 0: configparameters: cntk.config:RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu
 MPI Rank 0: configparameters: cntk.config:speechTrain=[
 MPI Rank 0:     action=train
-MPI Rank 0:     modelPath=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
+MPI Rank 0:     modelPath=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
 MPI Rank 0:     deviceId=0
 MPI Rank 0:     traceLevel=1
 MPI Rank 0:     SimpleNetworkBuilder=[
@@ -323,10 +323,11 @@ MPI Rank 0:       ]
 MPI Rank 0:     ]
 MPI Rank 0: ]
 MPI Rank 0: 
-MPI Rank 0: configparameters: cntk.config:stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr
+MPI Rank 0: configparameters: cntk.config:stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr
 MPI Rank 0: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 MPI Rank 0: command: speechTrain 
 MPI Rank 0: precision = float
+MPI Rank 0: CNTKModelPath: /tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
 MPI Rank 0: CNTKCommandTrainInfo: speechTrain : 3
 MPI Rank 0: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3
 MPI Rank 0: CNTKCommandTrainBegin: speechTrain
@@ -338,11 +339,12 @@ MPI Rank 0: htkmlfreader: reading MLF file /home/mluser/src/cplx_master/Tests/Sp
 MPI Rank 0: ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
 MPI Rank 0: label set 0: 129 classes
 MPI Rank 0: minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
+MPI Rank 0: SetUniformRandomValue (GPU): creating curand object with seed 1
 MPI Rank 0: GetTrainCriterionNodes  ...
 MPI Rank 0: GetEvalCriterionNodes  ...
 MPI Rank 0: 
 MPI Rank 0: 
-MPI Rank 0: Validating node CrossEntropyWithSoftmax 
+MPI Rank 0: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1.
 MPI Rank 0: 
 MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3]
 MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512]
@@ -365,13 +367,57 @@ MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1]
 MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
 MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
 MPI Rank 0: 
+MPI Rank 0: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2.
 MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 0: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 0: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3]
+MPI Rank 0: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3]
+MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
+MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
+MPI Rank 0: 
+MPI Rank 0: Validating for node CrossEntropyWithSoftmax, final verification.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 0: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 0: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3]
+MPI Rank 0: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3]
+MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
+MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
 MPI Rank 0: 
 MPI Rank 0: 9 out of 20 nodes do not share the minibatch layout with the input data.
-MPI Rank 0: Found 6 PreCompute nodes
-MPI Rank 0: 	NodeName: InvStdOfFeatures
-MPI Rank 0: 	NodeName: MeanOfFeatures
-MPI Rank 0: 	NodeName: Prior
+MPI Rank 0: 
+MPI Rank 0: 
+MPI Rank 0: Precomputing --> 3 PreCompute nodes found.
+MPI Rank 0: 
 MPI Rank 0: 	NodeName: InvStdOfFeatures
 MPI Rank 0: 	NodeName: MeanOfFeatures
 MPI Rank 0: 	NodeName: Prior
@@ -379,126 +425,192 @@ MPI Rank 0: minibatchiterator: epoch 0: frames [0..252734] (first utterance at f
 MPI Rank 0: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
 MPI Rank 0: 
 MPI Rank 0: 
-MPI Rank 0: Validating node InvStdOfFeatures 
+MPI Rank 0: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1.
 MPI Rank 0: 
-MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 64]
-MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 64]) -> [363, 1]
+MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 0: 
+MPI Rank 0: Validating for node InvStdOfFeatures, final verification.
 MPI Rank 0: 
+MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 0: 
 MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data.
 MPI Rank 0: 
 MPI Rank 0: 
-MPI Rank 0: Validating node MeanOfFeatures 
 MPI Rank 0: 
-MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 64]
-MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 64]) -> [363, 1]
+MPI Rank 0: Validating for node MeanOfFeatures. 2 nodes to process in pass 1.
 MPI Rank 0: 
+MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 0: 
+MPI Rank 0: Validating for node MeanOfFeatures, final verification.
+MPI Rank 0: 
+MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 0: 
 MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data.
 MPI Rank 0: 
 MPI Rank 0: 
-MPI Rank 0: Validating node Prior 
 MPI Rank 0: 
-MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 64]
-MPI Rank 0: Validating --> Prior = Mean(labels[132, MBSize 64]) -> [132, 1]
+MPI Rank 0: Validating for node Prior. 2 nodes to process in pass 1.
 MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 0: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
 MPI Rank 0: 
+MPI Rank 0: Validating for node Prior. 1 nodes to process in pass 2.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 0: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
+MPI Rank 0: 
+MPI Rank 0: Validating for node Prior, final verification.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 0: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
 MPI Rank 0: 
 MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data.
+MPI Rank 0: 
+MPI Rank 0: EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once.
+MPI Rank 0: 
+MPI Rank 0: Precomputing --> Completed.
+MPI Rank 0: 
 MPI Rank 0: Set Max Temp Mem Size For Convolution Nodes to 0 samples.
-MPI Rank 0: Starting Epoch 1: learning rate per sample = 0.015625  momentum = 0.900000 
+MPI Rank 0: Starting Epoch 1: learning rate per sample = 0.015625  effective momentum = 0.900000 
 MPI Rank 0: minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 0 of 3, with 1 datapasses
 MPI Rank 0: 
-MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
 MPI Rank 0: 
+MPI Rank 0: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1.
 MPI Rank 0: 
-MPI Rank 0: Validating node EvalErrorPrediction 
-MPI Rank 0: 
-MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 29]
+MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 62]
 MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512]
 MPI Rank 0: Validating --> W1 = LearnableParameter -> [512, 512]
 MPI Rank 0: Validating --> W0 = LearnableParameter -> [512, 363]
-MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 29]
-MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 29]) -> [363, 1]
-MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 29]) -> [363, 1]
-MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 29], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 29]
-MPI Rank 0: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 29]) -> [512, MBSize 29]
+MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 62]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+MPI Rank 0: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
 MPI Rank 0: Validating --> B0 = LearnableParameter -> [512, 1]
-MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 29], B0[512, 1]) -> [512, MBSize 29]
-MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 29]) -> [512, MBSize 29]
-MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 29]) -> [512, MBSize 29]
+MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
 MPI Rank 0: Validating --> B1 = LearnableParameter -> [512, 1]
-MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 29], B1[512, 1]) -> [512, MBSize 29]
-MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 29]) -> [512, MBSize 29]
-MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 29]) -> [132, MBSize 29]
+MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
 MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1]
-MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 29], B2[132, 1]) -> [132, MBSize 29]
-MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 29], HLast[132, MBSize 29]) -> [1, 1]
+MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
 MPI Rank 0: 
+MPI Rank 0: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2.
 MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 62]
+MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 0: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 0: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 62]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+MPI Rank 0: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
+MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
+MPI Rank 0: 
+MPI Rank 0: Validating for node EvalErrorPrediction, final verification.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 62]
+MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 0: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 0: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 62]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+MPI Rank 0: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
+MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
 MPI Rank 0: 
 MPI Rank 0: 9 out of 20 nodes do not share the minibatch layout with the input data.
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[   1-  10 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.32135414; EvalErr[0]PerSample = 0.90000000; TotalTime = 0.25642s; TotalTimePerSample = 0.40065ms; SamplesPerSecond = 2495
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  11-  20 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.15070930; EvalErr[0]PerSample = 0.86718750; TotalTime = 0.24686s; TotalTimePerSample = 0.38571ms; SamplesPerSecond = 2592
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  21-  30 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.99901060; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.24575s; TotalTimePerSample = 0.38398ms; SamplesPerSecond = 2604
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  31-  40 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.86945780; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.24696s; TotalTimePerSample = 0.38588ms; SamplesPerSecond = 2591
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  41-  50 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.80219517; EvalErr[0]PerSample = 0.87812500; TotalTime = 0.24516s; TotalTimePerSample = 0.38307ms; SamplesPerSecond = 2610
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  51-  60 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.72890717; EvalErr[0]PerSample = 0.86875000; TotalTime = 0.24462s; TotalTimePerSample = 0.38221ms; SamplesPerSecond = 2616
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  61-  70 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.56187025; EvalErr[0]PerSample = 0.82343750; TotalTime = 0.24416s; TotalTimePerSample = 0.38150ms; SamplesPerSecond = 2621
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  71-  80 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.42790310; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.24566s; TotalTimePerSample = 0.38384ms; SamplesPerSecond = 2605
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  81-  90 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.33928303; EvalErr[0]PerSample = 0.77343750; TotalTime = 0.24437s; TotalTimePerSample = 0.38184ms; SamplesPerSecond = 2618
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.36398734; EvalErr[0]PerSample = 0.84375000; TotalTime = 0.24545s; TotalTimePerSample = 0.38352ms; SamplesPerSecond = 2607
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.21223679; EvalErr[0]PerSample = 0.75312500; TotalTime = 0.24567s; TotalTimePerSample = 0.38385ms; SamplesPerSecond = 2605
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.31265333; EvalErr[0]PerSample = 0.78750000; TotalTime = 0.24655s; TotalTimePerSample = 0.38523ms; SamplesPerSecond = 2595
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.14081673; EvalErr[0]PerSample = 0.74687500; TotalTime = 0.24650s; TotalTimePerSample = 0.38515ms; SamplesPerSecond = 2596
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.00690023; EvalErr[0]PerSample = 0.69687500; TotalTime = 0.24591s; TotalTimePerSample = 0.38424ms; SamplesPerSecond = 2602
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.00496087; EvalErr[0]PerSample = 0.72343750; TotalTime = 0.24734s; TotalTimePerSample = 0.38648ms; SamplesPerSecond = 2587
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.97859121; EvalErr[0]PerSample = 0.73906250; TotalTime = 0.24469s; TotalTimePerSample = 0.38233ms; SamplesPerSecond = 2615
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.85686638; EvalErr[0]PerSample = 0.70781250; TotalTime = 0.24529s; TotalTimePerSample = 0.38327ms; SamplesPerSecond = 2609
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.69053374; EvalErr[0]PerSample = 0.67187500; TotalTime = 0.24514s; TotalTimePerSample = 0.38303ms; SamplesPerSecond = 2610
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.78653366; EvalErr[0]PerSample = 0.70468750; TotalTime = 0.24606s; TotalTimePerSample = 0.38447ms; SamplesPerSecond = 2600
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.57702529; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.24668s; TotalTimePerSample = 0.38543ms; SamplesPerSecond = 2594
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.61570793; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.24435s; TotalTimePerSample = 0.38180ms; SamplesPerSecond = 2619
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.55235603; EvalErr[0]PerSample = 0.65781250; TotalTime = 0.24639s; TotalTimePerSample = 0.38499ms; SamplesPerSecond = 2597
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.48211165; EvalErr[0]PerSample = 0.62500000; TotalTime = 0.24605s; TotalTimePerSample = 0.38446ms; SamplesPerSecond = 2601
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.38778376; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.24591s; TotalTimePerSample = 0.38423ms; SamplesPerSecond = 2602
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.36900911; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.24475s; TotalTimePerSample = 0.38243ms; SamplesPerSecond = 2614
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.43967760; EvalErr[0]PerSample = 0.63281250; TotalTime = 0.24451s; TotalTimePerSample = 0.38205ms; SamplesPerSecond = 2617
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.30281011; EvalErr[0]PerSample = 0.61250000; TotalTime = 0.24558s; TotalTimePerSample = 0.38371ms; SamplesPerSecond = 2606
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.19669121; EvalErr[0]PerSample = 0.55937500; TotalTime = 0.24470s; TotalTimePerSample = 0.38235ms; SamplesPerSecond = 2615
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.28979560; EvalErr[0]PerSample = 0.60468750; TotalTime = 0.24495s; TotalTimePerSample = 0.38273ms; SamplesPerSecond = 2612
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.17750506; EvalErr[0]PerSample = 0.62187500; TotalTime = 0.24520s; TotalTimePerSample = 0.38313ms; SamplesPerSecond = 2610
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.26264305; EvalErr[0]PerSample = 0.59687500; TotalTime = 0.24493s; TotalTimePerSample = 0.38270ms; SamplesPerSecond = 2613
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.15073149; EvalErr[0]PerSample = 0.56250000; TotalTime = 0.24465s; TotalTimePerSample = 0.38226ms; SamplesPerSecond = 2616
-MPI Rank 0: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 2.9799568; EvalErrPerSample = 0.72216797; Ave LearnRatePerSample = 0.015625; EpochTime=7.871485
-MPI Rank 0: Starting Epoch 2: learning rate per sample = 0.001953  momentum = 0.656119 
+MPI Rank 0: 
+MPI Rank 0: 
+MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[   1-  10 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.32135295; EvalErr[0]PerSample = 0.90000000; TotalTime = 0.24478s; TotalTimePerSample = 0.38247ms; SamplesPerSecond = 2614
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  11-  20 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.15070941; EvalErr[0]PerSample = 0.86718750; TotalTime = 0.23448s; TotalTimePerSample = 0.36637ms; SamplesPerSecond = 2729
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  21-  30 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.99901066; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.22704s; TotalTimePerSample = 0.35475ms; SamplesPerSecond = 2818
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  31-  40 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.86945816; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.22872s; TotalTimePerSample = 0.35738ms; SamplesPerSecond = 2798
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  41-  50 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.80219557; EvalErr[0]PerSample = 0.87812500; TotalTime = 0.22743s; TotalTimePerSample = 0.35535ms; SamplesPerSecond = 2814
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  51-  60 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.72890766; EvalErr[0]PerSample = 0.86875000; TotalTime = 0.22625s; TotalTimePerSample = 0.35352ms; SamplesPerSecond = 2828
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  61-  70 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.56187065; EvalErr[0]PerSample = 0.82343750; TotalTime = 0.22749s; TotalTimePerSample = 0.35546ms; SamplesPerSecond = 2813
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  71-  80 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.42790299; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.22725s; TotalTimePerSample = 0.35508ms; SamplesPerSecond = 2816
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  81-  90 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.33928338; EvalErr[0]PerSample = 0.77343750; TotalTime = 0.22716s; TotalTimePerSample = 0.35493ms; SamplesPerSecond = 2817
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.36398772; EvalErr[0]PerSample = 0.84375000; TotalTime = 0.22771s; TotalTimePerSample = 0.35580ms; SamplesPerSecond = 2810
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.21223693; EvalErr[0]PerSample = 0.75312500; TotalTime = 0.22768s; TotalTimePerSample = 0.35575ms; SamplesPerSecond = 2810
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.31265357; EvalErr[0]PerSample = 0.78750000; TotalTime = 0.22736s; TotalTimePerSample = 0.35525ms; SamplesPerSecond = 2814
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.14081698; EvalErr[0]PerSample = 0.74687500; TotalTime = 0.22896s; TotalTimePerSample = 0.35775ms; SamplesPerSecond = 2795
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.00690035; EvalErr[0]PerSample = 0.69687500; TotalTime = 0.22665s; TotalTimePerSample = 0.35414ms; SamplesPerSecond = 2823
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.00496066; EvalErr[0]PerSample = 0.72343750; TotalTime = 0.22970s; TotalTimePerSample = 0.35891ms; SamplesPerSecond = 2786
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.97859081; EvalErr[0]PerSample = 0.73906250; TotalTime = 0.22699s; TotalTimePerSample = 0.35468ms; SamplesPerSecond = 2819
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.85686609; EvalErr[0]PerSample = 0.70781250; TotalTime = 0.22767s; TotalTimePerSample = 0.35574ms; SamplesPerSecond = 2811
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.69053374; EvalErr[0]PerSample = 0.67187500; TotalTime = 0.22778s; TotalTimePerSample = 0.35590ms; SamplesPerSecond = 2809
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.78653376; EvalErr[0]PerSample = 0.70468750; TotalTime = 0.22753s; TotalTimePerSample = 0.35551ms; SamplesPerSecond = 2812
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.57702533; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.22747s; TotalTimePerSample = 0.35542ms; SamplesPerSecond = 2813
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.61570805; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.22869s; TotalTimePerSample = 0.35733ms; SamplesPerSecond = 2798
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.55235582; EvalErr[0]PerSample = 0.65781250; TotalTime = 0.22822s; TotalTimePerSample = 0.35660ms; SamplesPerSecond = 2804
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.48211151; EvalErr[0]PerSample = 0.62500000; TotalTime = 0.22783s; TotalTimePerSample = 0.35599ms; SamplesPerSecond = 2809
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.38778372; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.22805s; TotalTimePerSample = 0.35633ms; SamplesPerSecond = 2806
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.36900902; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.22808s; TotalTimePerSample = 0.35637ms; SamplesPerSecond = 2806
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.43967781; EvalErr[0]PerSample = 0.63281250; TotalTime = 0.22820s; TotalTimePerSample = 0.35656ms; SamplesPerSecond = 2804
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.30281039; EvalErr[0]PerSample = 0.61250000; TotalTime = 0.22746s; TotalTimePerSample = 0.35541ms; SamplesPerSecond = 2813
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.19669146; EvalErr[0]PerSample = 0.55937500; TotalTime = 0.22798s; TotalTimePerSample = 0.35621ms; SamplesPerSecond = 2807
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.28979581; EvalErr[0]PerSample = 0.60468750; TotalTime = 0.22790s; TotalTimePerSample = 0.35610ms; SamplesPerSecond = 2808
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.17750535; EvalErr[0]PerSample = 0.62187500; TotalTime = 0.22741s; TotalTimePerSample = 0.35532ms; SamplesPerSecond = 2814
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.26264398; EvalErr[0]PerSample = 0.59687500; TotalTime = 0.22641s; TotalTimePerSample = 0.35377ms; SamplesPerSecond = 2826
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.15073110; EvalErr[0]PerSample = 0.56250000; TotalTime = 0.22676s; TotalTimePerSample = 0.35431ms; SamplesPerSecond = 2822
+MPI Rank 0: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 2.9799569; EvalErrPerSample = 0.72216797; Ave LearnRatePerSample = 0.015625; EpochTime=7.319218
+MPI Rank 0: Starting Epoch 2: learning rate per sample = 0.001953  effective momentum = 0.656119 
 MPI Rank 0: minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 0 of 3, with 1 datapasses
 MPI Rank 0: 
 MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
-MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[   1-  10 of 80]: SamplesSeen = 2560; TrainLossPerSample =  2.01598530; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.28942s; TotalTimePerSample = 0.11306ms; SamplesPerSecond = 8845
-MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  11-  20 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98818586; EvalErr[0]PerSample = 0.54296875; TotalTime = 0.28095s; TotalTimePerSample = 0.10975ms; SamplesPerSecond = 9111
-MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  21-  30 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98698123; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.28222s; TotalTimePerSample = 0.11024ms; SamplesPerSecond = 9071
-MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  31-  40 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.93126298; EvalErr[0]PerSample = 0.52773437; TotalTime = 0.27954s; TotalTimePerSample = 0.10920ms; SamplesPerSecond = 9157
-MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  41-  50 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.90067741; EvalErr[0]PerSample = 0.52656250; TotalTime = 0.27987s; TotalTimePerSample = 0.10933ms; SamplesPerSecond = 9146
-MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  51-  60 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.97115807; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.27910s; TotalTimePerSample = 0.10903ms; SamplesPerSecond = 9172
-MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  61-  70 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.89518067; EvalErr[0]PerSample = 0.52031250; TotalTime = 0.27764s; TotalTimePerSample = 0.10846ms; SamplesPerSecond = 9220
-MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  71-  80 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.90450396; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.27504s; TotalTimePerSample = 0.10744ms; SamplesPerSecond = 9307
-MPI Rank 0: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9492419; EvalErrPerSample = 0.53417969; Ave LearnRatePerSample = 0.001953125; EpochTime=2.248479
-MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.000098  momentum = 0.656119 
+MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[   1-  10 of 80]: SamplesSeen = 2560; TrainLossPerSample =  2.01598514; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.25250s; TotalTimePerSample = 0.09863ms; SamplesPerSecond = 10138
+MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  11-  20 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98818590; EvalErr[0]PerSample = 0.54296875; TotalTime = 0.24831s; TotalTimePerSample = 0.09700ms; SamplesPerSecond = 10309
+MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  21-  30 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98698122; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.25203s; TotalTimePerSample = 0.09845ms; SamplesPerSecond = 10157
+MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  31-  40 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.93126295; EvalErr[0]PerSample = 0.52773437; TotalTime = 0.25003s; TotalTimePerSample = 0.09767ms; SamplesPerSecond = 10238
+MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  41-  50 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.90067743; EvalErr[0]PerSample = 0.52656250; TotalTime = 0.24767s; TotalTimePerSample = 0.09675ms; SamplesPerSecond = 10336
+MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  51-  60 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.97115808; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.25090s; TotalTimePerSample = 0.09801ms; SamplesPerSecond = 10203
+MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  61-  70 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.89518061; EvalErr[0]PerSample = 0.52031250; TotalTime = 0.24830s; TotalTimePerSample = 0.09699ms; SamplesPerSecond = 10309
+MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  71-  80 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.90450394; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.24457s; TotalTimePerSample = 0.09553ms; SamplesPerSecond = 10467
+MPI Rank 0: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9492419; EvalErrPerSample = 0.53417969; Ave LearnRatePerSample = 0.001953125; EpochTime=1.999658
+MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.000098  effective momentum = 0.656119 
 MPI Rank 0: minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 0 of 3, with 1 datapasses
 MPI Rank 0: 
 MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
-MPI Rank 0:  Epoch[ 3 of 3]-Minibatch[   1-  10 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.87359851; EvalErr[0]PerSample = 0.51933594; TotalTime = 0.44176s; TotalTimePerSample = 0.04314ms; SamplesPerSecond = 23179
-MPI Rank 0:  Epoch[ 3 of 3]-Minibatch[  11-  20 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.86656277; EvalErr[0]PerSample = 0.51748047; TotalTime = 0.42652s; TotalTimePerSample = 0.04165ms; SamplesPerSecond = 24008
-MPI Rank 0: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8700806; EvalErrPerSample = 0.5184082; Ave LearnRatePerSample = 9.765625146e-05; EpochTime=0.881328
+MPI Rank 0:  Epoch[ 3 of 3]-Minibatch[   1-  10 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.87359841; EvalErr[0]PerSample = 0.51933594; TotalTime = 0.36717s; TotalTimePerSample = 0.03586ms; SamplesPerSecond = 27888
+MPI Rank 0:  Epoch[ 3 of 3]-Minibatch[  11-  20 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.86656271; EvalErr[0]PerSample = 0.51748047; TotalTime = 0.34559s; TotalTimePerSample = 0.03375ms; SamplesPerSecond = 29630
+MPI Rank 0: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8700806; EvalErrPerSample = 0.5184082; Ave LearnRatePerSample = 9.765625146e-05; EpochTime=0.728516
 MPI Rank 0: CNTKCommandTrainEnd: speechTrain
 MPI Rank 0: COMPLETED
 MPI Rank 0: ~MPIWrapper
-MPI Rank 1: running on localhost at 2015/10/02 13:38:53
-MPI Rank 1: command line options: 
-MPI Rank 1: configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/../cntk.config RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data DeviceId=0 stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr 
+MPI Rank 1: running on localhost at 2015/10/24 12:56:12
+MPI Rank 1: command line: 
+MPI Rank 1: /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/../cntk.config RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/.. DeviceId=0 stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr 
 MPI Rank 1: 
 MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
 MPI Rank 1: precision=float
@@ -590,10 +702,11 @@ MPI Rank 1:           labelType=Category
 MPI Rank 1:       ]
 MPI Rank 1:     ]
 MPI Rank 1: ]
-MPI Rank 1: RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu
+MPI Rank 1: RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu
 MPI Rank 1: DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
+MPI Rank 1: ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/..
 MPI Rank 1: DeviceId=0
-MPI Rank 1: stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr
+MPI Rank 1: stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr
 MPI Rank 1: 
 MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
 MPI Rank 1: 
@@ -604,7 +717,7 @@ MPI Rank 1: deviceId=0
 MPI Rank 1: parallelTrain=true
 MPI Rank 1: speechTrain=[
 MPI Rank 1:     action=train
-MPI Rank 1:     modelPath=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
+MPI Rank 1:     modelPath=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
 MPI Rank 1:     deviceId=0
 MPI Rank 1:     traceLevel=1
 MPI Rank 1:     SimpleNetworkBuilder=[
@@ -687,23 +800,25 @@ MPI Rank 1:           labelType=Category
 MPI Rank 1:       ]
 MPI Rank 1:     ]
 MPI Rank 1: ]
-MPI Rank 1: RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu
+MPI Rank 1: RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu
 MPI Rank 1: DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
+MPI Rank 1: ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/..
 MPI Rank 1: DeviceId=0
-MPI Rank 1: stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr
+MPI Rank 1: stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr
 MPI Rank 1: 
 MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 MPI Rank 1: 
 MPI Rank 1: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 MPI Rank 1: configparameters: cntk.config:command=speechTrain
+MPI Rank 1: configparameters: cntk.config:ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/..
 MPI Rank 1: configparameters: cntk.config:DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
 MPI Rank 1: configparameters: cntk.config:deviceId=0
 MPI Rank 1: configparameters: cntk.config:parallelTrain=true
 MPI Rank 1: configparameters: cntk.config:precision=float
-MPI Rank 1: configparameters: cntk.config:RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu
+MPI Rank 1: configparameters: cntk.config:RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu
 MPI Rank 1: configparameters: cntk.config:speechTrain=[
 MPI Rank 1:     action=train
-MPI Rank 1:     modelPath=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
+MPI Rank 1:     modelPath=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
 MPI Rank 1:     deviceId=0
 MPI Rank 1:     traceLevel=1
 MPI Rank 1:     SimpleNetworkBuilder=[
@@ -787,10 +902,11 @@ MPI Rank 1:       ]
 MPI Rank 1:     ]
 MPI Rank 1: ]
 MPI Rank 1: 
-MPI Rank 1: configparameters: cntk.config:stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr
+MPI Rank 1: configparameters: cntk.config:stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr
 MPI Rank 1: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 MPI Rank 1: command: speechTrain 
 MPI Rank 1: precision = float
+MPI Rank 1: CNTKModelPath: /tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
 MPI Rank 1: CNTKCommandTrainInfo: speechTrain : 3
 MPI Rank 1: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3
 MPI Rank 1: CNTKCommandTrainBegin: speechTrain
@@ -802,11 +918,12 @@ MPI Rank 1: htkmlfreader: reading MLF file /home/mluser/src/cplx_master/Tests/Sp
 MPI Rank 1: ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
 MPI Rank 1: label set 0: 129 classes
 MPI Rank 1: minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
+MPI Rank 1: SetUniformRandomValue (GPU): creating curand object with seed 1
 MPI Rank 1: GetTrainCriterionNodes  ...
 MPI Rank 1: GetEvalCriterionNodes  ...
 MPI Rank 1: 
 MPI Rank 1: 
-MPI Rank 1: Validating node CrossEntropyWithSoftmax 
+MPI Rank 1: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1.
 MPI Rank 1: 
 MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3]
 MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512]
@@ -829,13 +946,57 @@ MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1]
 MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
 MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
 MPI Rank 1: 
+MPI Rank 1: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2.
 MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 1: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 1: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3]
+MPI Rank 1: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3]
+MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
+MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
+MPI Rank 1: 
+MPI Rank 1: Validating for node CrossEntropyWithSoftmax, final verification.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 1: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 1: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3]
+MPI Rank 1: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3]
+MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
+MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
 MPI Rank 1: 
 MPI Rank 1: 9 out of 20 nodes do not share the minibatch layout with the input data.
-MPI Rank 1: Found 6 PreCompute nodes
-MPI Rank 1: 	NodeName: InvStdOfFeatures
-MPI Rank 1: 	NodeName: MeanOfFeatures
-MPI Rank 1: 	NodeName: Prior
+MPI Rank 1: 
+MPI Rank 1: 
+MPI Rank 1: Precomputing --> 3 PreCompute nodes found.
+MPI Rank 1: 
 MPI Rank 1: 	NodeName: InvStdOfFeatures
 MPI Rank 1: 	NodeName: MeanOfFeatures
 MPI Rank 1: 	NodeName: Prior
@@ -843,126 +1004,192 @@ MPI Rank 1: minibatchiterator: epoch 0: frames [0..252734] (first utterance at f
 MPI Rank 1: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
 MPI Rank 1: 
 MPI Rank 1: 
-MPI Rank 1: Validating node InvStdOfFeatures 
+MPI Rank 1: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1.
 MPI Rank 1: 
-MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 64]
-MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 64]) -> [363, 1]
+MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 1: 
+MPI Rank 1: Validating for node InvStdOfFeatures, final verification.
 MPI Rank 1: 
+MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 1: 
 MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data.
 MPI Rank 1: 
 MPI Rank 1: 
-MPI Rank 1: Validating node MeanOfFeatures 
 MPI Rank 1: 
-MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 64]
-MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 64]) -> [363, 1]
+MPI Rank 1: Validating for node MeanOfFeatures. 2 nodes to process in pass 1.
 MPI Rank 1: 
+MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 1: 
+MPI Rank 1: Validating for node MeanOfFeatures, final verification.
+MPI Rank 1: 
+MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 1: 
 MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data.
 MPI Rank 1: 
 MPI Rank 1: 
-MPI Rank 1: Validating node Prior 
 MPI Rank 1: 
-MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 64]
-MPI Rank 1: Validating --> Prior = Mean(labels[132, MBSize 64]) -> [132, 1]
+MPI Rank 1: Validating for node Prior. 2 nodes to process in pass 1.
 MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 1: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
 MPI Rank 1: 
+MPI Rank 1: Validating for node Prior. 1 nodes to process in pass 2.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 1: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
+MPI Rank 1: 
+MPI Rank 1: Validating for node Prior, final verification.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 1: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
 MPI Rank 1: 
 MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data.
+MPI Rank 1: 
+MPI Rank 1: EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once.
+MPI Rank 1: 
+MPI Rank 1: Precomputing --> Completed.
+MPI Rank 1: 
 MPI Rank 1: Set Max Temp Mem Size For Convolution Nodes to 0 samples.
-MPI Rank 1: Starting Epoch 1: learning rate per sample = 0.015625  momentum = 0.900000 
+MPI Rank 1: Starting Epoch 1: learning rate per sample = 0.015625  effective momentum = 0.900000 
 MPI Rank 1: minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 1 of 3, with 1 datapasses
 MPI Rank 1: 
-MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
 MPI Rank 1: 
+MPI Rank 1: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1.
 MPI Rank 1: 
-MPI Rank 1: Validating node EvalErrorPrediction 
-MPI Rank 1: 
-MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 28]
+MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 62]
 MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512]
 MPI Rank 1: Validating --> W1 = LearnableParameter -> [512, 512]
 MPI Rank 1: Validating --> W0 = LearnableParameter -> [512, 363]
-MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 28]
-MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 28]) -> [363, 1]
-MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 28]) -> [363, 1]
-MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 28], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 28]
-MPI Rank 1: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 28]) -> [512, MBSize 28]
+MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 62]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+MPI Rank 1: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
 MPI Rank 1: Validating --> B0 = LearnableParameter -> [512, 1]
-MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 28], B0[512, 1]) -> [512, MBSize 28]
-MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 28]) -> [512, MBSize 28]
-MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 28]) -> [512, MBSize 28]
+MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
 MPI Rank 1: Validating --> B1 = LearnableParameter -> [512, 1]
-MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 28], B1[512, 1]) -> [512, MBSize 28]
-MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 28]) -> [512, MBSize 28]
-MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 28]) -> [132, MBSize 28]
+MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
 MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1]
-MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 28], B2[132, 1]) -> [132, MBSize 28]
-MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 28], HLast[132, MBSize 28]) -> [1, 1]
+MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
 MPI Rank 1: 
+MPI Rank 1: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2.
 MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 62]
+MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 1: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 1: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 62]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+MPI Rank 1: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
+MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
+MPI Rank 1: 
+MPI Rank 1: Validating for node EvalErrorPrediction, final verification.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 62]
+MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 1: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 1: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 62]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+MPI Rank 1: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
+MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
 MPI Rank 1: 
 MPI Rank 1: 9 out of 20 nodes do not share the minibatch layout with the input data.
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[   1-  10 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.32135414; EvalErr[0]PerSample = 0.90000000; TotalTime = 0.25660s; TotalTimePerSample = 0.40093ms; SamplesPerSecond = 2494
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  11-  20 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.15070930; EvalErr[0]PerSample = 0.86718750; TotalTime = 0.24685s; TotalTimePerSample = 0.38571ms; SamplesPerSecond = 2592
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  21-  30 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.99901060; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.24575s; TotalTimePerSample = 0.38398ms; SamplesPerSecond = 2604
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  31-  40 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.86945780; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.24696s; TotalTimePerSample = 0.38587ms; SamplesPerSecond = 2591
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  41-  50 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.80219517; EvalErr[0]PerSample = 0.87812500; TotalTime = 0.24515s; TotalTimePerSample = 0.38305ms; SamplesPerSecond = 2610
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  51-  60 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.72890717; EvalErr[0]PerSample = 0.86875000; TotalTime = 0.24461s; TotalTimePerSample = 0.38220ms; SamplesPerSecond = 2616
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  61-  70 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.56187025; EvalErr[0]PerSample = 0.82343750; TotalTime = 0.24416s; TotalTimePerSample = 0.38149ms; SamplesPerSecond = 2621
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  71-  80 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.42790310; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.24565s; TotalTimePerSample = 0.38382ms; SamplesPerSecond = 2605
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  81-  90 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.33928303; EvalErr[0]PerSample = 0.77343750; TotalTime = 0.24435s; TotalTimePerSample = 0.38179ms; SamplesPerSecond = 2619
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.36398734; EvalErr[0]PerSample = 0.84375000; TotalTime = 0.24545s; TotalTimePerSample = 0.38352ms; SamplesPerSecond = 2607
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.21223679; EvalErr[0]PerSample = 0.75312500; TotalTime = 0.24566s; TotalTimePerSample = 0.38384ms; SamplesPerSecond = 2605
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.31265333; EvalErr[0]PerSample = 0.78750000; TotalTime = 0.24655s; TotalTimePerSample = 0.38523ms; SamplesPerSecond = 2595
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.14081673; EvalErr[0]PerSample = 0.74687500; TotalTime = 0.24648s; TotalTimePerSample = 0.38513ms; SamplesPerSecond = 2596
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.00690023; EvalErr[0]PerSample = 0.69687500; TotalTime = 0.24592s; TotalTimePerSample = 0.38424ms; SamplesPerSecond = 2602
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.00496087; EvalErr[0]PerSample = 0.72343750; TotalTime = 0.24733s; TotalTimePerSample = 0.38646ms; SamplesPerSecond = 2587
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.97859121; EvalErr[0]PerSample = 0.73906250; TotalTime = 0.24469s; TotalTimePerSample = 0.38232ms; SamplesPerSecond = 2615
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.85686638; EvalErr[0]PerSample = 0.70781250; TotalTime = 0.24529s; TotalTimePerSample = 0.38326ms; SamplesPerSecond = 2609
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.69053374; EvalErr[0]PerSample = 0.67187500; TotalTime = 0.24513s; TotalTimePerSample = 0.38302ms; SamplesPerSecond = 2610
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.78653366; EvalErr[0]PerSample = 0.70468750; TotalTime = 0.24605s; TotalTimePerSample = 0.38446ms; SamplesPerSecond = 2601
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.57702529; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.24667s; TotalTimePerSample = 0.38542ms; SamplesPerSecond = 2594
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.61570793; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.24435s; TotalTimePerSample = 0.38179ms; SamplesPerSecond = 2619
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.55235603; EvalErr[0]PerSample = 0.65781250; TotalTime = 0.24639s; TotalTimePerSample = 0.38498ms; SamplesPerSecond = 2597
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.48211165; EvalErr[0]PerSample = 0.62500000; TotalTime = 0.24605s; TotalTimePerSample = 0.38445ms; SamplesPerSecond = 2601
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.38778376; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.24590s; TotalTimePerSample = 0.38422ms; SamplesPerSecond = 2602
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.36900911; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.24475s; TotalTimePerSample = 0.38242ms; SamplesPerSecond = 2614
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.43967760; EvalErr[0]PerSample = 0.63281250; TotalTime = 0.24451s; TotalTimePerSample = 0.38204ms; SamplesPerSecond = 2617
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.30281011; EvalErr[0]PerSample = 0.61250000; TotalTime = 0.24557s; TotalTimePerSample = 0.38370ms; SamplesPerSecond = 2606
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.19669121; EvalErr[0]PerSample = 0.55937500; TotalTime = 0.24469s; TotalTimePerSample = 0.38234ms; SamplesPerSecond = 2615
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.28979560; EvalErr[0]PerSample = 0.60468750; TotalTime = 0.24494s; TotalTimePerSample = 0.38272ms; SamplesPerSecond = 2612
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.17750506; EvalErr[0]PerSample = 0.62187500; TotalTime = 0.24520s; TotalTimePerSample = 0.38312ms; SamplesPerSecond = 2610
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.26264305; EvalErr[0]PerSample = 0.59687500; TotalTime = 0.24493s; TotalTimePerSample = 0.38270ms; SamplesPerSecond = 2613
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.15073149; EvalErr[0]PerSample = 0.56250000; TotalTime = 0.24494s; TotalTimePerSample = 0.38271ms; SamplesPerSecond = 2612
-MPI Rank 1: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 2.9799568; EvalErrPerSample = 0.72216797; Ave LearnRatePerSample = 0.015625; EpochTime=7.871133
-MPI Rank 1: Starting Epoch 2: learning rate per sample = 0.001953  momentum = 0.656119 
+MPI Rank 1: 
+MPI Rank 1: 
+MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[   1-  10 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.32135295; EvalErr[0]PerSample = 0.90000000; TotalTime = 0.24521s; TotalTimePerSample = 0.38315ms; SamplesPerSecond = 2609
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  11-  20 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.15070941; EvalErr[0]PerSample = 0.86718750; TotalTime = 0.23443s; TotalTimePerSample = 0.36629ms; SamplesPerSecond = 2730
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  21-  30 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.99901066; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.22705s; TotalTimePerSample = 0.35477ms; SamplesPerSecond = 2818
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  31-  40 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.86945816; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.22773s; TotalTimePerSample = 0.35583ms; SamplesPerSecond = 2810
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  41-  50 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.80219557; EvalErr[0]PerSample = 0.87812500; TotalTime = 0.22741s; TotalTimePerSample = 0.35533ms; SamplesPerSecond = 2814
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  51-  60 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.72890766; EvalErr[0]PerSample = 0.86875000; TotalTime = 0.22725s; TotalTimePerSample = 0.35507ms; SamplesPerSecond = 2816
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  61-  70 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.56187065; EvalErr[0]PerSample = 0.82343750; TotalTime = 0.22534s; TotalTimePerSample = 0.35209ms; SamplesPerSecond = 2840
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  71-  80 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.42790299; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.22941s; TotalTimePerSample = 0.35846ms; SamplesPerSecond = 2789
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  81-  90 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.33928338; EvalErr[0]PerSample = 0.77343750; TotalTime = 0.22614s; TotalTimePerSample = 0.35334ms; SamplesPerSecond = 2830
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.36398772; EvalErr[0]PerSample = 0.84375000; TotalTime = 0.22771s; TotalTimePerSample = 0.35580ms; SamplesPerSecond = 2810
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.21223693; EvalErr[0]PerSample = 0.75312500; TotalTime = 0.22797s; TotalTimePerSample = 0.35620ms; SamplesPerSecond = 2807
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.31265357; EvalErr[0]PerSample = 0.78750000; TotalTime = 0.22806s; TotalTimePerSample = 0.35634ms; SamplesPerSecond = 2806
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.14081698; EvalErr[0]PerSample = 0.74687500; TotalTime = 0.22797s; TotalTimePerSample = 0.35621ms; SamplesPerSecond = 2807
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.00690035; EvalErr[0]PerSample = 0.69687500; TotalTime = 0.22873s; TotalTimePerSample = 0.35740ms; SamplesPerSecond = 2798
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.00496066; EvalErr[0]PerSample = 0.72343750; TotalTime = 0.22763s; TotalTimePerSample = 0.35567ms; SamplesPerSecond = 2811
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.97859081; EvalErr[0]PerSample = 0.73906250; TotalTime = 0.22697s; TotalTimePerSample = 0.35465ms; SamplesPerSecond = 2819
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.85686609; EvalErr[0]PerSample = 0.70781250; TotalTime = 0.22767s; TotalTimePerSample = 0.35573ms; SamplesPerSecond = 2811
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.69053374; EvalErr[0]PerSample = 0.67187500; TotalTime = 0.22776s; TotalTimePerSample = 0.35587ms; SamplesPerSecond = 2810
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.78653376; EvalErr[0]PerSample = 0.70468750; TotalTime = 0.22751s; TotalTimePerSample = 0.35548ms; SamplesPerSecond = 2813
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.57702533; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.22746s; TotalTimePerSample = 0.35541ms; SamplesPerSecond = 2813
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.61570805; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.22868s; TotalTimePerSample = 0.35731ms; SamplesPerSecond = 2798
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.55235582; EvalErr[0]PerSample = 0.65781250; TotalTime = 0.22821s; TotalTimePerSample = 0.35658ms; SamplesPerSecond = 2804
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.48211151; EvalErr[0]PerSample = 0.62500000; TotalTime = 0.22782s; TotalTimePerSample = 0.35597ms; SamplesPerSecond = 2809
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.38778372; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.22683s; TotalTimePerSample = 0.35443ms; SamplesPerSecond = 2821
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.36900902; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.23027s; TotalTimePerSample = 0.35980ms; SamplesPerSecond = 2779
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.43967781; EvalErr[0]PerSample = 0.63281250; TotalTime = 0.22756s; TotalTimePerSample = 0.35556ms; SamplesPerSecond = 2812
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.30281039; EvalErr[0]PerSample = 0.61250000; TotalTime = 0.22812s; TotalTimePerSample = 0.35643ms; SamplesPerSecond = 2805
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.19669146; EvalErr[0]PerSample = 0.55937500; TotalTime = 0.22729s; TotalTimePerSample = 0.35514ms; SamplesPerSecond = 2815
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.28979581; EvalErr[0]PerSample = 0.60468750; TotalTime = 0.22758s; TotalTimePerSample = 0.35560ms; SamplesPerSecond = 2812
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.17750535; EvalErr[0]PerSample = 0.62187500; TotalTime = 0.22775s; TotalTimePerSample = 0.35585ms; SamplesPerSecond = 2810
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.26264398; EvalErr[0]PerSample = 0.59687500; TotalTime = 0.22606s; TotalTimePerSample = 0.35322ms; SamplesPerSecond = 2831
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.15073110; EvalErr[0]PerSample = 0.56250000; TotalTime = 0.22674s; TotalTimePerSample = 0.35429ms; SamplesPerSecond = 2822
+MPI Rank 1: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 2.9799569; EvalErrPerSample = 0.72216797; Ave LearnRatePerSample = 0.015625; EpochTime=7.319114
+MPI Rank 1: Starting Epoch 2: learning rate per sample = 0.001953  effective momentum = 0.656119 
 MPI Rank 1: minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 1 of 3, with 1 datapasses
 MPI Rank 1: 
 MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
-MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[   1-  10 of 80]: SamplesSeen = 2560; TrainLossPerSample =  2.01598530; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.29109s; TotalTimePerSample = 0.11371ms; SamplesPerSecond = 8794
-MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  11-  20 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98818586; EvalErr[0]PerSample = 0.54296875; TotalTime = 0.28094s; TotalTimePerSample = 0.10974ms; SamplesPerSecond = 9112
-MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  21-  30 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98698123; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.28221s; TotalTimePerSample = 0.11024ms; SamplesPerSecond = 9071
-MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  31-  40 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.93126298; EvalErr[0]PerSample = 0.52773437; TotalTime = 0.27954s; TotalTimePerSample = 0.10919ms; SamplesPerSecond = 9157
-MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  41-  50 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.90067741; EvalErr[0]PerSample = 0.52656250; TotalTime = 0.27987s; TotalTimePerSample = 0.10932ms; SamplesPerSecond = 9147
-MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  51-  60 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.97115807; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.27910s; TotalTimePerSample = 0.10902ms; SamplesPerSecond = 9172
-MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  61-  70 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.89518067; EvalErr[0]PerSample = 0.52031250; TotalTime = 0.27764s; TotalTimePerSample = 0.10845ms; SamplesPerSecond = 9220
-MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  71-  80 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.90450396; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.27505s; TotalTimePerSample = 0.10744ms; SamplesPerSecond = 9307
-MPI Rank 1: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9492419; EvalErrPerSample = 0.53417969; Ave LearnRatePerSample = 0.001953125; EpochTime=2.248133
-MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.000098  momentum = 0.656119 
+MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[   1-  10 of 80]: SamplesSeen = 2560; TrainLossPerSample =  2.01598514; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.25371s; TotalTimePerSample = 0.09911ms; SamplesPerSecond = 10090
+MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  11-  20 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98818590; EvalErr[0]PerSample = 0.54296875; TotalTime = 0.24939s; TotalTimePerSample = 0.09742ms; SamplesPerSecond = 10265
+MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  21-  30 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98698122; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.25094s; TotalTimePerSample = 0.09802ms; SamplesPerSecond = 10201
+MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  31-  40 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.93126295; EvalErr[0]PerSample = 0.52773437; TotalTime = 0.24942s; TotalTimePerSample = 0.09743ms; SamplesPerSecond = 10263
+MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  41-  50 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.90067743; EvalErr[0]PerSample = 0.52656250; TotalTime = 0.24938s; TotalTimePerSample = 0.09741ms; SamplesPerSecond = 10265
+MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  51-  60 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.97115808; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.24886s; TotalTimePerSample = 0.09721ms; SamplesPerSecond = 10286
+MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  61-  70 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.89518061; EvalErr[0]PerSample = 0.52031250; TotalTime = 0.24865s; TotalTimePerSample = 0.09713ms; SamplesPerSecond = 10295
+MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  71-  80 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.90450394; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.24518s; TotalTimePerSample = 0.09577ms; SamplesPerSecond = 10441
+MPI Rank 1: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9492419; EvalErrPerSample = 0.53417969; Ave LearnRatePerSample = 0.001953125; EpochTime=1.999721
+MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.000098  effective momentum = 0.656119 
 MPI Rank 1: minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 1 of 3, with 1 datapasses
 MPI Rank 1: 
 MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
-MPI Rank 1:  Epoch[ 3 of 3]-Minibatch[   1-  10 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.87359851; EvalErr[0]PerSample = 0.51933594; TotalTime = 0.44649s; TotalTimePerSample = 0.04360ms; SamplesPerSecond = 22934
-MPI Rank 1:  Epoch[ 3 of 3]-Minibatch[  11-  20 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.86656277; EvalErr[0]PerSample = 0.51748047; TotalTime = 0.42652s; TotalTimePerSample = 0.04165ms; SamplesPerSecond = 24008
-MPI Rank 1: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8700806; EvalErrPerSample = 0.5184082; Ave LearnRatePerSample = 9.765625146e-05; EpochTime=0.880982
+MPI Rank 1:  Epoch[ 3 of 3]-Minibatch[   1-  10 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.87359841; EvalErr[0]PerSample = 0.51933594; TotalTime = 0.36750s; TotalTimePerSample = 0.03589ms; SamplesPerSecond = 27863
+MPI Rank 1:  Epoch[ 3 of 3]-Minibatch[  11-  20 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.86656271; EvalErr[0]PerSample = 0.51748047; TotalTime = 0.34559s; TotalTimePerSample = 0.03375ms; SamplesPerSecond = 29630
+MPI Rank 1: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8700806; EvalErrPerSample = 0.5184082; Ave LearnRatePerSample = 9.765625146e-05; EpochTime=0.728446
 MPI Rank 1: CNTKCommandTrainEnd: speechTrain
 MPI Rank 1: COMPLETED
 MPI Rank 1: ~MPIWrapper
-MPI Rank 2: running on localhost at 2015/10/02 13:38:53
-MPI Rank 2: command line options: 
-MPI Rank 2: configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/../cntk.config RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data DeviceId=0 stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr 
+MPI Rank 2: running on localhost at 2015/10/24 12:56:12
+MPI Rank 2: command line: 
+MPI Rank 2: /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/../cntk.config RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/.. DeviceId=0 stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr 
 MPI Rank 2: 
 MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
 MPI Rank 2: precision=float
@@ -1054,10 +1281,11 @@ MPI Rank 2:           labelType=Category
 MPI Rank 2:       ]
 MPI Rank 2:     ]
 MPI Rank 2: ]
-MPI Rank 2: RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu
+MPI Rank 2: RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu
 MPI Rank 2: DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
+MPI Rank 2: ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/..
 MPI Rank 2: DeviceId=0
-MPI Rank 2: stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr
+MPI Rank 2: stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr
 MPI Rank 2: 
 MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
 MPI Rank 2: 
@@ -1068,7 +1296,7 @@ MPI Rank 2: deviceId=0
 MPI Rank 2: parallelTrain=true
 MPI Rank 2: speechTrain=[
 MPI Rank 2:     action=train
-MPI Rank 2:     modelPath=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
+MPI Rank 2:     modelPath=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
 MPI Rank 2:     deviceId=0
 MPI Rank 2:     traceLevel=1
 MPI Rank 2:     SimpleNetworkBuilder=[
@@ -1151,23 +1379,25 @@ MPI Rank 2:           labelType=Category
 MPI Rank 2:       ]
 MPI Rank 2:     ]
 MPI Rank 2: ]
-MPI Rank 2: RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu
+MPI Rank 2: RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu
 MPI Rank 2: DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
+MPI Rank 2: ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/..
 MPI Rank 2: DeviceId=0
-MPI Rank 2: stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr
+MPI Rank 2: stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr
 MPI Rank 2: 
 MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 MPI Rank 2: 
 MPI Rank 2: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 MPI Rank 2: configparameters: cntk.config:command=speechTrain
+MPI Rank 2: configparameters: cntk.config:ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/DNN/ParallelNoQuantization/..
 MPI Rank 2: configparameters: cntk.config:DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
 MPI Rank 2: configparameters: cntk.config:deviceId=0
 MPI Rank 2: configparameters: cntk.config:parallelTrain=true
 MPI Rank 2: configparameters: cntk.config:precision=float
-MPI Rank 2: configparameters: cntk.config:RunDir=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu
+MPI Rank 2: configparameters: cntk.config:RunDir=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu
 MPI Rank 2: configparameters: cntk.config:speechTrain=[
 MPI Rank 2:     action=train
-MPI Rank 2:     modelPath=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
+MPI Rank 2:     modelPath=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
 MPI Rank 2:     deviceId=0
 MPI Rank 2:     traceLevel=1
 MPI Rank 2:     SimpleNetworkBuilder=[
@@ -1251,10 +1481,11 @@ MPI Rank 2:       ]
 MPI Rank 2:     ]
 MPI Rank 2: ]
 MPI Rank 2: 
-MPI Rank 2: configparameters: cntk.config:stderr=/tmp/cntk-test-20151002133421.200863/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr
+MPI Rank 2: configparameters: cntk.config:stderr=/tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/stderr
 MPI Rank 2: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 MPI Rank 2: command: speechTrain 
 MPI Rank 2: precision = float
+MPI Rank 2: CNTKModelPath: /tmp/cntk-test-20151024125611.671961/Speech/DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
 MPI Rank 2: CNTKCommandTrainInfo: speechTrain : 3
 MPI Rank 2: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3
 MPI Rank 2: CNTKCommandTrainBegin: speechTrain
@@ -1266,11 +1497,12 @@ MPI Rank 2: htkmlfreader: reading MLF file /home/mluser/src/cplx_master/Tests/Sp
 MPI Rank 2: ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
 MPI Rank 2: label set 0: 129 classes
 MPI Rank 2: minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
+MPI Rank 2: SetUniformRandomValue (GPU): creating curand object with seed 1
 MPI Rank 2: GetTrainCriterionNodes  ...
 MPI Rank 2: GetEvalCriterionNodes  ...
 MPI Rank 2: 
 MPI Rank 2: 
-MPI Rank 2: Validating node CrossEntropyWithSoftmax 
+MPI Rank 2: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1.
 MPI Rank 2: 
 MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3]
 MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512]
@@ -1293,13 +1525,57 @@ MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1]
 MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
 MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
 MPI Rank 2: 
+MPI Rank 2: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2.
 MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 2: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 2: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3]
+MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3]
+MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
+MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
+MPI Rank 2: 
+MPI Rank 2: Validating for node CrossEntropyWithSoftmax, final verification.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 2: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 2: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3]
+MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3]
+MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
+MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
 MPI Rank 2: 
 MPI Rank 2: 9 out of 20 nodes do not share the minibatch layout with the input data.
-MPI Rank 2: Found 6 PreCompute nodes
-MPI Rank 2: 	NodeName: InvStdOfFeatures
-MPI Rank 2: 	NodeName: MeanOfFeatures
-MPI Rank 2: 	NodeName: Prior
+MPI Rank 2: 
+MPI Rank 2: 
+MPI Rank 2: Precomputing --> 3 PreCompute nodes found.
+MPI Rank 2: 
 MPI Rank 2: 	NodeName: InvStdOfFeatures
 MPI Rank 2: 	NodeName: MeanOfFeatures
 MPI Rank 2: 	NodeName: Prior
@@ -1307,120 +1583,186 @@ MPI Rank 2: minibatchiterator: epoch 0: frames [0..252734] (first utterance at f
 MPI Rank 2: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
 MPI Rank 2: 
 MPI Rank 2: 
-MPI Rank 2: Validating node InvStdOfFeatures 
+MPI Rank 2: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1.
 MPI Rank 2: 
-MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 64]
-MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 64]) -> [363, 1]
+MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 2: 
+MPI Rank 2: Validating for node InvStdOfFeatures, final verification.
 MPI Rank 2: 
+MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 2: 
 MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data.
 MPI Rank 2: 
 MPI Rank 2: 
-MPI Rank 2: Validating node MeanOfFeatures 
 MPI Rank 2: 
-MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 64]
-MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 64]) -> [363, 1]
+MPI Rank 2: Validating for node MeanOfFeatures. 2 nodes to process in pass 1.
 MPI Rank 2: 
+MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 2: 
+MPI Rank 2: Validating for node MeanOfFeatures, final verification.
+MPI Rank 2: 
+MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 2: 
 MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data.
 MPI Rank 2: 
 MPI Rank 2: 
-MPI Rank 2: Validating node Prior 
 MPI Rank 2: 
-MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 64]
-MPI Rank 2: Validating --> Prior = Mean(labels[132, MBSize 64]) -> [132, 1]
+MPI Rank 2: Validating for node Prior. 2 nodes to process in pass 1.
 MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 2: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
 MPI Rank 2: 
+MPI Rank 2: Validating for node Prior. 1 nodes to process in pass 2.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 2: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
+MPI Rank 2: 
+MPI Rank 2: Validating for node Prior, final verification.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 2: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
 MPI Rank 2: 
 MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data.
+MPI Rank 2: 
+MPI Rank 2: EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once.
+MPI Rank 2: 
+MPI Rank 2: Precomputing --> Completed.
+MPI Rank 2: 
 MPI Rank 2: Set Max Temp Mem Size For Convolution Nodes to 0 samples.
-MPI Rank 2: Starting Epoch 1: learning rate per sample = 0.015625  momentum = 0.900000 
+MPI Rank 2: Starting Epoch 1: learning rate per sample = 0.015625  effective momentum = 0.900000 
 MPI Rank 2: minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 2 of 3, with 1 datapasses
 MPI Rank 2: 
-MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
 MPI Rank 2: 
+MPI Rank 2: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1.
 MPI Rank 2: 
-MPI Rank 2: Validating node EvalErrorPrediction 
-MPI Rank 2: 
-MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 7]
+MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 62]
 MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512]
 MPI Rank 2: Validating --> W1 = LearnableParameter -> [512, 512]
 MPI Rank 2: Validating --> W0 = LearnableParameter -> [512, 363]
-MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 7]
-MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 7]) -> [363, 1]
-MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 7]) -> [363, 1]
-MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 7], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 7]
-MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 7]) -> [512, MBSize 7]
+MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 62]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
 MPI Rank 2: Validating --> B0 = LearnableParameter -> [512, 1]
-MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 7], B0[512, 1]) -> [512, MBSize 7]
-MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 7]) -> [512, MBSize 7]
-MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 7]) -> [512, MBSize 7]
+MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
 MPI Rank 2: Validating --> B1 = LearnableParameter -> [512, 1]
-MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 7], B1[512, 1]) -> [512, MBSize 7]
-MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 7]) -> [512, MBSize 7]
-MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 7]) -> [132, MBSize 7]
+MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
 MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1]
-MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 7], B2[132, 1]) -> [132, MBSize 7]
-MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 7], HLast[132, MBSize 7]) -> [1, 1]
+MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
 MPI Rank 2: 
+MPI Rank 2: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2.
 MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 62]
+MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 2: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 2: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 62]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
+MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
+MPI Rank 2: 
+MPI Rank 2: Validating for node EvalErrorPrediction, final verification.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 62]
+MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 2: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 2: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 62]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
+MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
 MPI Rank 2: 
 MPI Rank 2: 9 out of 20 nodes do not share the minibatch layout with the input data.
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[   1-  10 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.32135414; EvalErr[0]PerSample = 0.90000000; TotalTime = 0.25673s; TotalTimePerSample = 0.40114ms; SamplesPerSecond = 2492
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  11-  20 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.15070930; EvalErr[0]PerSample = 0.86718750; TotalTime = 0.24685s; TotalTimePerSample = 0.38571ms; SamplesPerSecond = 2592
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  21-  30 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.99901060; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.24575s; TotalTimePerSample = 0.38398ms; SamplesPerSecond = 2604
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  31-  40 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.86945780; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.24696s; TotalTimePerSample = 0.38588ms; SamplesPerSecond = 2591
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  41-  50 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.80219517; EvalErr[0]PerSample = 0.87812500; TotalTime = 0.24516s; TotalTimePerSample = 0.38307ms; SamplesPerSecond = 2610
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  51-  60 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.72890717; EvalErr[0]PerSample = 0.86875000; TotalTime = 0.24462s; TotalTimePerSample = 0.38221ms; SamplesPerSecond = 2616
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  61-  70 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.56187025; EvalErr[0]PerSample = 0.82343750; TotalTime = 0.24416s; TotalTimePerSample = 0.38150ms; SamplesPerSecond = 2621
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  71-  80 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.42790310; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.24565s; TotalTimePerSample = 0.38383ms; SamplesPerSecond = 2605
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  81-  90 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.33928303; EvalErr[0]PerSample = 0.77343750; TotalTime = 0.24437s; TotalTimePerSample = 0.38183ms; SamplesPerSecond = 2618
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.36398734; EvalErr[0]PerSample = 0.84375000; TotalTime = 0.24545s; TotalTimePerSample = 0.38352ms; SamplesPerSecond = 2607
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.21223679; EvalErr[0]PerSample = 0.75312500; TotalTime = 0.24567s; TotalTimePerSample = 0.38385ms; SamplesPerSecond = 2605
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.31265333; EvalErr[0]PerSample = 0.78750000; TotalTime = 0.24655s; TotalTimePerSample = 0.38523ms; SamplesPerSecond = 2595
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.14081673; EvalErr[0]PerSample = 0.74687500; TotalTime = 0.24650s; TotalTimePerSample = 0.38515ms; SamplesPerSecond = 2596
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.00690023; EvalErr[0]PerSample = 0.69687500; TotalTime = 0.24591s; TotalTimePerSample = 0.38424ms; SamplesPerSecond = 2602
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.00496087; EvalErr[0]PerSample = 0.72343750; TotalTime = 0.24734s; TotalTimePerSample = 0.38647ms; SamplesPerSecond = 2587
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.97859121; EvalErr[0]PerSample = 0.73906250; TotalTime = 0.24469s; TotalTimePerSample = 0.38233ms; SamplesPerSecond = 2615
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.85686638; EvalErr[0]PerSample = 0.70781250; TotalTime = 0.24529s; TotalTimePerSample = 0.38327ms; SamplesPerSecond = 2609
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.69053374; EvalErr[0]PerSample = 0.67187500; TotalTime = 0.24514s; TotalTimePerSample = 0.38303ms; SamplesPerSecond = 2610
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.78653366; EvalErr[0]PerSample = 0.70468750; TotalTime = 0.24606s; TotalTimePerSample = 0.38447ms; SamplesPerSecond = 2600
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.57702529; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.24668s; TotalTimePerSample = 0.38543ms; SamplesPerSecond = 2594
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.61570793; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.24435s; TotalTimePerSample = 0.38180ms; SamplesPerSecond = 2619
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.55235603; EvalErr[0]PerSample = 0.65781250; TotalTime = 0.24639s; TotalTimePerSample = 0.38499ms; SamplesPerSecond = 2597
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.48211165; EvalErr[0]PerSample = 0.62500000; TotalTime = 0.24605s; TotalTimePerSample = 0.38446ms; SamplesPerSecond = 2601
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.38778376; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.24590s; TotalTimePerSample = 0.38423ms; SamplesPerSecond = 2602
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.36900911; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.24475s; TotalTimePerSample = 0.38242ms; SamplesPerSecond = 2614
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.43967760; EvalErr[0]PerSample = 0.63281250; TotalTime = 0.24451s; TotalTimePerSample = 0.38205ms; SamplesPerSecond = 2617
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.30281011; EvalErr[0]PerSample = 0.61250000; TotalTime = 0.24558s; TotalTimePerSample = 0.38371ms; SamplesPerSecond = 2606
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.19669121; EvalErr[0]PerSample = 0.55937500; TotalTime = 0.24470s; TotalTimePerSample = 0.38235ms; SamplesPerSecond = 2615
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.28979560; EvalErr[0]PerSample = 0.60468750; TotalTime = 0.24495s; TotalTimePerSample = 0.38273ms; SamplesPerSecond = 2612
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.17750506; EvalErr[0]PerSample = 0.62187500; TotalTime = 0.24520s; TotalTimePerSample = 0.38313ms; SamplesPerSecond = 2610
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.26264305; EvalErr[0]PerSample = 0.59687500; TotalTime = 0.24493s; TotalTimePerSample = 0.38270ms; SamplesPerSecond = 2613
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.15073149; EvalErr[0]PerSample = 0.56250000; TotalTime = 0.24465s; TotalTimePerSample = 0.38226ms; SamplesPerSecond = 2616
-MPI Rank 2: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 2.9799568; EvalErrPerSample = 0.72216797; Ave LearnRatePerSample = 0.015625; EpochTime=7.87131
-MPI Rank 2: Starting Epoch 2: learning rate per sample = 0.001953  momentum = 0.656119 
+MPI Rank 2: 
+MPI Rank 2: 
+MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[   1-  10 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.32135295; EvalErr[0]PerSample = 0.90000000; TotalTime = 0.24689s; TotalTimePerSample = 0.38577ms; SamplesPerSecond = 2592
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  11-  20 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.15070941; EvalErr[0]PerSample = 0.86718750; TotalTime = 0.23655s; TotalTimePerSample = 0.36962ms; SamplesPerSecond = 2705
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  21-  30 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.99901066; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.22706s; TotalTimePerSample = 0.35478ms; SamplesPerSecond = 2818
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  31-  40 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.86945816; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.22778s; TotalTimePerSample = 0.35590ms; SamplesPerSecond = 2809
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  41-  50 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.80219557; EvalErr[0]PerSample = 0.87812500; TotalTime = 0.22743s; TotalTimePerSample = 0.35536ms; SamplesPerSecond = 2814
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  51-  60 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.72890766; EvalErr[0]PerSample = 0.86875000; TotalTime = 0.22722s; TotalTimePerSample = 0.35502ms; SamplesPerSecond = 2816
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  61-  70 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.56187065; EvalErr[0]PerSample = 0.82343750; TotalTime = 0.22646s; TotalTimePerSample = 0.35384ms; SamplesPerSecond = 2826
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  71-  80 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.42790299; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.22831s; TotalTimePerSample = 0.35673ms; SamplesPerSecond = 2803
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  81-  90 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.33928338; EvalErr[0]PerSample = 0.77343750; TotalTime = 0.22619s; TotalTimePerSample = 0.35342ms; SamplesPerSecond = 2829
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.36398772; EvalErr[0]PerSample = 0.84375000; TotalTime = 0.22774s; TotalTimePerSample = 0.35584ms; SamplesPerSecond = 2810
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.21223693; EvalErr[0]PerSample = 0.75312500; TotalTime = 0.22830s; TotalTimePerSample = 0.35671ms; SamplesPerSecond = 2803
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.31265357; EvalErr[0]PerSample = 0.78750000; TotalTime = 0.22772s; TotalTimePerSample = 0.35581ms; SamplesPerSecond = 2810
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.14081698; EvalErr[0]PerSample = 0.74687500; TotalTime = 0.22802s; TotalTimePerSample = 0.35628ms; SamplesPerSecond = 2806
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.00690035; EvalErr[0]PerSample = 0.69687500; TotalTime = 0.22880s; TotalTimePerSample = 0.35751ms; SamplesPerSecond = 2797
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.00496066; EvalErr[0]PerSample = 0.72343750; TotalTime = 0.22758s; TotalTimePerSample = 0.35559ms; SamplesPerSecond = 2812
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.97859081; EvalErr[0]PerSample = 0.73906250; TotalTime = 0.22700s; TotalTimePerSample = 0.35468ms; SamplesPerSecond = 2819
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.85686609; EvalErr[0]PerSample = 0.70781250; TotalTime = 0.22768s; TotalTimePerSample = 0.35575ms; SamplesPerSecond = 2810
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.69053374; EvalErr[0]PerSample = 0.67187500; TotalTime = 0.22778s; TotalTimePerSample = 0.35590ms; SamplesPerSecond = 2809
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.78653376; EvalErr[0]PerSample = 0.70468750; TotalTime = 0.22753s; TotalTimePerSample = 0.35551ms; SamplesPerSecond = 2812
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.57702533; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.22748s; TotalTimePerSample = 0.35544ms; SamplesPerSecond = 2813
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.61570805; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.22869s; TotalTimePerSample = 0.35733ms; SamplesPerSecond = 2798
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.55235582; EvalErr[0]PerSample = 0.65781250; TotalTime = 0.22823s; TotalTimePerSample = 0.35661ms; SamplesPerSecond = 2804
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.48211151; EvalErr[0]PerSample = 0.62500000; TotalTime = 0.22784s; TotalTimePerSample = 0.35600ms; SamplesPerSecond = 2809
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.38778372; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.22795s; TotalTimePerSample = 0.35618ms; SamplesPerSecond = 2807
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.36900902; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.22914s; TotalTimePerSample = 0.35803ms; SamplesPerSecond = 2793
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.43967781; EvalErr[0]PerSample = 0.63281250; TotalTime = 0.22690s; TotalTimePerSample = 0.35454ms; SamplesPerSecond = 2820
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.30281039; EvalErr[0]PerSample = 0.61250000; TotalTime = 0.22876s; TotalTimePerSample = 0.35743ms; SamplesPerSecond = 2797
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.19669146; EvalErr[0]PerSample = 0.55937500; TotalTime = 0.22764s; TotalTimePerSample = 0.35569ms; SamplesPerSecond = 2811
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.28979581; EvalErr[0]PerSample = 0.60468750; TotalTime = 0.22730s; TotalTimePerSample = 0.35515ms; SamplesPerSecond = 2815
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.17750535; EvalErr[0]PerSample = 0.62187500; TotalTime = 0.22705s; TotalTimePerSample = 0.35477ms; SamplesPerSecond = 2818
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.26264398; EvalErr[0]PerSample = 0.59687500; TotalTime = 0.22676s; TotalTimePerSample = 0.35431ms; SamplesPerSecond = 2822
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.15073110; EvalErr[0]PerSample = 0.56250000; TotalTime = 0.22676s; TotalTimePerSample = 0.35431ms; SamplesPerSecond = 2822
+MPI Rank 2: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 2.9799569; EvalErrPerSample = 0.72216797; Ave LearnRatePerSample = 0.015625; EpochTime=7.319151
+MPI Rank 2: Starting Epoch 2: learning rate per sample = 0.001953  effective momentum = 0.656119 
 MPI Rank 2: minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 2 of 3, with 1 datapasses
 MPI Rank 2: 
 MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
-MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[   1-  10 of 80]: SamplesSeen = 2560; TrainLossPerSample =  2.01598530; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.29002s; TotalTimePerSample = 0.11329ms; SamplesPerSecond = 8826
-MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  11-  20 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98818586; EvalErr[0]PerSample = 0.54296875; TotalTime = 0.28095s; TotalTimePerSample = 0.10975ms; SamplesPerSecond = 9111
-MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  21-  30 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98698123; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.28222s; TotalTimePerSample = 0.11024ms; SamplesPerSecond = 9071
-MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  31-  40 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.93126298; EvalErr[0]PerSample = 0.52773437; TotalTime = 0.27954s; TotalTimePerSample = 0.10920ms; SamplesPerSecond = 9157
-MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  41-  50 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.90067741; EvalErr[0]PerSample = 0.52656250; TotalTime = 0.27987s; TotalTimePerSample = 0.10933ms; SamplesPerSecond = 9146
-MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  51-  60 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.97115807; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.27910s; TotalTimePerSample = 0.10902ms; SamplesPerSecond = 9172
-MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  61-  70 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.89518067; EvalErr[0]PerSample = 0.52031250; TotalTime = 0.27764s; TotalTimePerSample = 0.10846ms; SamplesPerSecond = 9220
-MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  71-  80 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.90450396; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.27504s; TotalTimePerSample = 0.10744ms; SamplesPerSecond = 9307
-MPI Rank 2: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9492419; EvalErrPerSample = 0.53417969; Ave LearnRatePerSample = 0.001953125; EpochTime=2.248306
-MPI Rank 2: Starting Epoch 3: learning rate per sample = 0.000098  momentum = 0.656119 
+MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[   1-  10 of 80]: SamplesSeen = 2560; TrainLossPerSample =  2.01598514; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.25401s; TotalTimePerSample = 0.09922ms; SamplesPerSecond = 10078
+MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  11-  20 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98818590; EvalErr[0]PerSample = 0.54296875; TotalTime = 0.24945s; TotalTimePerSample = 0.09744ms; SamplesPerSecond = 10262
+MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  21-  30 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98698122; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.24991s; TotalTimePerSample = 0.09762ms; SamplesPerSecond = 10243
+MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  31-  40 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.93126295; EvalErr[0]PerSample = 0.52773437; TotalTime = 0.24970s; TotalTimePerSample = 0.09754ms; SamplesPerSecond = 10252
+MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  41-  50 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.90067743; EvalErr[0]PerSample = 0.52656250; TotalTime = 0.25009s; TotalTimePerSample = 0.09769ms; SamplesPerSecond = 10236
+MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  51-  60 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.97115808; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.24884s; TotalTimePerSample = 0.09720ms; SamplesPerSecond = 10287
+MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  61-  70 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.89518061; EvalErr[0]PerSample = 0.52031250; TotalTime = 0.24796s; TotalTimePerSample = 0.09686ms; SamplesPerSecond = 10324
+MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  71-  80 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.90450394; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.24485s; TotalTimePerSample = 0.09565ms; SamplesPerSecond = 10455
+MPI Rank 2: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9492419; EvalErrPerSample = 0.53417969; Ave LearnRatePerSample = 0.001953125; EpochTime=1.999609
+MPI Rank 2: Starting Epoch 3: learning rate per sample = 0.000098  effective momentum = 0.656119 
 MPI Rank 2: minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 2 of 3, with 1 datapasses
 MPI Rank 2: 
 MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
-MPI Rank 2:  Epoch[ 3 of 3]-Minibatch[   1-  10 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.87359851; EvalErr[0]PerSample = 0.51933594; TotalTime = 0.44455s; TotalTimePerSample = 0.04341ms; SamplesPerSecond = 23034
-MPI Rank 2:  Epoch[ 3 of 3]-Minibatch[  11-  20 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.86656277; EvalErr[0]PerSample = 0.51748047; TotalTime = 0.42652s; TotalTimePerSample = 0.04165ms; SamplesPerSecond = 24008
-MPI Rank 2: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8700806; EvalErrPerSample = 0.5184082; Ave LearnRatePerSample = 9.765625146e-05; EpochTime=0.881154
+MPI Rank 2:  Epoch[ 3 of 3]-Minibatch[   1-  10 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.87359841; EvalErr[0]PerSample = 0.51933594; TotalTime = 0.37054s; TotalTimePerSample = 0.03619ms; SamplesPerSecond = 27635
+MPI Rank 2:  Epoch[ 3 of 3]-Minibatch[  11-  20 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.86656271; EvalErr[0]PerSample = 0.51748047; TotalTime = 0.34556s; TotalTimePerSample = 0.03375ms; SamplesPerSecond = 29632
+MPI Rank 2: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8700806; EvalErrPerSample = 0.5184082; Ave LearnRatePerSample = 9.765625146e-05; EpochTime=0.728405
 MPI Rank 2: CNTKCommandTrainEnd: speechTrain
 MPI Rank 2: COMPLETED
 MPI Rank 2: ~MPIWrapper
diff --git a/Tests/Speech/DNN/ParallelNoQuantization/baseline.windows.gpu.txt b/Tests/Speech/DNN/ParallelNoQuantization/baseline.windows.gpu.txt
index 3449e702d..f658bc92c 100644
--- a/Tests/Speech/DNN/ParallelNoQuantization/baseline.windows.gpu.txt
+++ b/Tests/Speech/DNN/ParallelNoQuantization/baseline.windows.gpu.txt
@@ -1,4 +1,4 @@
-=== Running C:\Program Files\Microsoft MPI\Bin\/mpiexec.exe -n 3 E:\NetScale\CNTK\git_repos\cplx_master\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\DNN\cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr
+=== Running C:\Program Files\Microsoft MPI\Bin\/mpiexec.exe -n 3 E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN/cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr
 MPIWrapper: initializing MPI
 MPIWrapper: initializing MPI
 MPIWrapper: initializing MPI
@@ -7,41 +7,38 @@ ping [requestnodes (before change)]: 3 nodes pinging each other
 ping [requestnodes (before change)]: 3 nodes pinging each other
 ping [requestnodes (before change)]: all 3 nodes responded
 requestnodes [MPIWrapper]: using 3 out of 3 MPI nodes (3 requested); we (2) are in (participating)
-ping [requestnodes (after change)]: 3 nodes pinging each other
 ping [requestnodes (before change)]: all 3 nodes responded
+ping [requestnodes (after change)]: 3 nodes pinging each other
 requestnodes [MPIWrapper]: using 3 out of 3 MPI nodes (3 requested); we (1) are in (participating)
 ping [requestnodes (before change)]: all 3 nodes responded
 ping [requestnodes (after change)]: 3 nodes pinging each other
 requestnodes [MPIWrapper]: using 3 out of 3 MPI nodes (3 requested); we (0) are in (participating)
 ping [requestnodes (after change)]: 3 nodes pinging each other
 ping [requestnodes (after change)]: all 3 nodes responded
-ping [requestnodes (after change)]: all 3 nodes responded
-mpihelper: we are cog 0 in a gearbox of 3
-ping [requestnodes (after change)]: all 3 nodes responded
 mpihelper: we are cog 1 in a gearbox of 3
+ping [requestnodes (after change)]: all 3 nodes responded
+ping [requestnodes (after change)]: all 3 nodes responded
 ping [mpihelper]: 3 nodes pinging each other
+mpihelper: we are cog 0 in a gearbox of 3
 mpihelper: we are cog 2 in a gearbox of 3
 ping [mpihelper]: 3 nodes pinging each other
 ping [mpihelper]: 3 nodes pinging each other
 ping [mpihelper]: all 3 nodes responded
 ping [mpihelper]: all 3 nodes responded
 ping [mpihelper]: all 3 nodes responded
-CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
-CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
-CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
-MPI Rank 0: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank0
+MPI Rank 0: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank0
 MPI Rank 0: -------------------------------------------------------------------
 MPI Rank 0: Build info: 
 MPI Rank 0: 
-MPI Rank 0: 		Built time: Oct  2 2015 13:14:34
-MPI Rank 0: 		Last modified date: Fri Oct  2 13:09:06 2015
+MPI Rank 0: 		Built time: Oct 24 2015 13:33:25
+MPI Rank 0: 		Last modified date: Thu Oct 22 16:00:27 2015
 MPI Rank 0: 		Built by amitaga on Amitaga-Win-DT3           
-MPI Rank 0: 		Build Path: E:\NetScale\CNTK\git_repos\cplx_master\MachineLearning\CNTK\
+MPI Rank 0: 		Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\
 MPI Rank 0: 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
 MPI Rank 0: -------------------------------------------------------------------
-MPI Rank 0: running on Amitaga-Win-DT3 at 2015/10/02 21:20:29
-MPI Rank 0: command line options: 
-MPI Rank 0: configFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\DNN\cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr 
+MPI Rank 0: running on Amitaga-Win-DT3 at 2015/10/24 22:14:12
+MPI Rank 0: command line: 
+MPI Rank 0: E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN/cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr 
 MPI Rank 0: 
 MPI Rank 0: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
 MPI Rank 0: precision=float
@@ -133,10 +130,11 @@ MPI Rank 0:           labelType=Category
 MPI Rank 0:       ]
 MPI Rank 0:     ]
 MPI Rank 0: ]
-MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu
-MPI Rank 0: DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data
+MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu
+MPI Rank 0: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
+MPI Rank 0: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN
 MPI Rank 0: DeviceId=0
-MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr
+MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr
 MPI Rank 0: 
 MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
 MPI Rank 0: 
@@ -147,7 +145,7 @@ MPI Rank 0: deviceId=0
 MPI Rank 0: parallelTrain=true
 MPI Rank 0: speechTrain=[
 MPI Rank 0:     action=train
-MPI Rank 0:     modelPath=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
+MPI Rank 0:     modelPath=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
 MPI Rank 0:     deviceId=0
 MPI Rank 0:     traceLevel=1
 MPI Rank 0:     SimpleNetworkBuilder=[
@@ -223,30 +221,32 @@ MPI Rank 0:           type=Real
 MPI Rank 0:           scpFile=glob_0000.scp
 MPI Rank 0:       ]
 MPI Rank 0:       labels=[
-MPI Rank 0:           mlfFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf
-MPI Rank 0:           labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list
+MPI Rank 0:           mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf
+MPI Rank 0:           labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list
 MPI Rank 0:           labelDim=132
 MPI Rank 0:           labelType=Category
 MPI Rank 0:       ]
 MPI Rank 0:     ]
 MPI Rank 0: ]
-MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu
-MPI Rank 0: DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data
+MPI Rank 0: RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu
+MPI Rank 0: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
+MPI Rank 0: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN
 MPI Rank 0: DeviceId=0
-MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr
+MPI Rank 0: stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr
 MPI Rank 0: 
 MPI Rank 0: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 MPI Rank 0: 
 MPI Rank 0: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 MPI Rank 0: configparameters: cntk.config:command=speechTrain
-MPI Rank 0: configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data
+MPI Rank 0: configparameters: cntk.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN
+MPI Rank 0: configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
 MPI Rank 0: configparameters: cntk.config:deviceId=0
 MPI Rank 0: configparameters: cntk.config:parallelTrain=true
 MPI Rank 0: configparameters: cntk.config:precision=float
-MPI Rank 0: configparameters: cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu
+MPI Rank 0: configparameters: cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu
 MPI Rank 0: configparameters: cntk.config:speechTrain=[
 MPI Rank 0:     action=train
-MPI Rank 0:     modelPath=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
+MPI Rank 0:     modelPath=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
 MPI Rank 0:     deviceId=0
 MPI Rank 0:     traceLevel=1
 MPI Rank 0:     SimpleNetworkBuilder=[
@@ -322,34 +322,36 @@ MPI Rank 0:           type=Real
 MPI Rank 0:           scpFile=glob_0000.scp
 MPI Rank 0:       ]
 MPI Rank 0:       labels=[
-MPI Rank 0:           mlfFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf
-MPI Rank 0:           labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list
+MPI Rank 0:           mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf
+MPI Rank 0:           labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list
 MPI Rank 0:           labelDim=132
 MPI Rank 0:           labelType=Category
 MPI Rank 0:       ]
 MPI Rank 0:     ]
 MPI Rank 0: ]
 MPI Rank 0: 
-MPI Rank 0: configparameters: cntk.config:stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr
+MPI Rank 0: configparameters: cntk.config:stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr
 MPI Rank 0: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 MPI Rank 0: command: speechTrain 
 MPI Rank 0: precision = float
+MPI Rank 0: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
 MPI Rank 0: CNTKCommandTrainInfo: speechTrain : 3
 MPI Rank 0: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3
 MPI Rank 0: CNTKCommandTrainBegin: speechTrain
 MPI Rank 0: SimpleNetworkBuilder Using GPU 0
 MPI Rank 0: reading script file glob_0000.scp ... 948 entries
 MPI Rank 0: trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion
-MPI Rank 0: total 132 state names in state list E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list
-MPI Rank 0: htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf ... total 948 entries
+MPI Rank 0: total 132 state names in state list E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list
+MPI Rank 0: htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf ... total 948 entries
 MPI Rank 0: ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
 MPI Rank 0: label set 0: 129 classes
 MPI Rank 0: minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
+MPI Rank 0: SetUniformRandomValue (GPU): creating curand object with seed 1
 MPI Rank 0: GetTrainCriterionNodes  ...
 MPI Rank 0: GetEvalCriterionNodes  ...
 MPI Rank 0: 
 MPI Rank 0: 
-MPI Rank 0: Validating node CrossEntropyWithSoftmax 
+MPI Rank 0: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1.
 MPI Rank 0: 
 MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3]
 MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512]
@@ -372,13 +374,57 @@ MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1]
 MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
 MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
 MPI Rank 0: 
+MPI Rank 0: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2.
 MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 0: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 0: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3]
+MPI Rank 0: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3]
+MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
+MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
+MPI Rank 0: 
+MPI Rank 0: Validating for node CrossEntropyWithSoftmax, final verification.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 0: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 0: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3]
+MPI Rank 0: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3]
+MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
+MPI Rank 0: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
 MPI Rank 0: 
 MPI Rank 0: 9 out of 20 nodes do not share the minibatch layout with the input data.
-MPI Rank 0: Found 6 PreCompute nodes
-MPI Rank 0: 	NodeName: InvStdOfFeatures
-MPI Rank 0: 	NodeName: MeanOfFeatures
-MPI Rank 0: 	NodeName: Prior
+MPI Rank 0: 
+MPI Rank 0: 
+MPI Rank 0: Precomputing --> 3 PreCompute nodes found.
+MPI Rank 0: 
 MPI Rank 0: 	NodeName: InvStdOfFeatures
 MPI Rank 0: 	NodeName: MeanOfFeatures
 MPI Rank 0: 	NodeName: Prior
@@ -386,136 +432,201 @@ MPI Rank 0: minibatchiterator: epoch 0: frames [0..252734] (first utterance at f
 MPI Rank 0: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
 MPI Rank 0: 
 MPI Rank 0: 
-MPI Rank 0: Validating node InvStdOfFeatures 
+MPI Rank 0: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1.
 MPI Rank 0: 
-MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 64]
-MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 64]) -> [363, 1]
+MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 0: 
+MPI Rank 0: Validating for node InvStdOfFeatures, final verification.
 MPI Rank 0: 
+MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 0: 
 MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data.
 MPI Rank 0: 
 MPI Rank 0: 
-MPI Rank 0: Validating node MeanOfFeatures 
 MPI Rank 0: 
-MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 64]
-MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 64]) -> [363, 1]
+MPI Rank 0: Validating for node MeanOfFeatures. 2 nodes to process in pass 1.
 MPI Rank 0: 
+MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 0: 
+MPI Rank 0: Validating for node MeanOfFeatures, final verification.
+MPI Rank 0: 
+MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 0: 
 MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data.
 MPI Rank 0: 
 MPI Rank 0: 
-MPI Rank 0: Validating node Prior 
 MPI Rank 0: 
-MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 64]
-MPI Rank 0: Validating --> Prior = Mean(labels[132, MBSize 64]) -> [132, 1]
+MPI Rank 0: Validating for node Prior. 2 nodes to process in pass 1.
 MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 0: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
 MPI Rank 0: 
+MPI Rank 0: Validating for node Prior. 1 nodes to process in pass 2.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 0: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
+MPI Rank 0: 
+MPI Rank 0: Validating for node Prior, final verification.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 0: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
 MPI Rank 0: 
 MPI Rank 0: 1 out of 2 nodes do not share the minibatch layout with the input data.
+MPI Rank 0: 
+MPI Rank 0: 
+MPI Rank 0: Precomputing --> Completed.
+MPI Rank 0: 
 MPI Rank 0: Set Max Temp Mem Size For Convolution Nodes to 0 samples.
-MPI Rank 0: Starting Epoch 1: learning rate per sample = 0.015625  momentum = 0.900000 
+MPI Rank 0: Starting Epoch 1: learning rate per sample = 0.015625  effective momentum = 0.900000 
 MPI Rank 0: minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 0 of 3, with 1 datapasses
 MPI Rank 0: 
-MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
 MPI Rank 0: 
+MPI Rank 0: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1.
 MPI Rank 0: 
-MPI Rank 0: Validating node EvalErrorPrediction 
-MPI Rank 0: 
-MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 33]
+MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 62]
 MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512]
 MPI Rank 0: Validating --> W1 = LearnableParameter -> [512, 512]
 MPI Rank 0: Validating --> W0 = LearnableParameter -> [512, 363]
-MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 33]
-MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 33]) -> [363, 1]
-MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 33]) -> [363, 1]
-MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 33], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 33]
-MPI Rank 0: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 33]) -> [512, MBSize 33]
+MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 62]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+MPI Rank 0: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
 MPI Rank 0: Validating --> B0 = LearnableParameter -> [512, 1]
-MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 33], B0[512, 1]) -> [512, MBSize 33]
-MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 33]) -> [512, MBSize 33]
-MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 33]) -> [512, MBSize 33]
+MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
 MPI Rank 0: Validating --> B1 = LearnableParameter -> [512, 1]
-MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 33], B1[512, 1]) -> [512, MBSize 33]
-MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 33]) -> [512, MBSize 33]
-MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 33]) -> [132, MBSize 33]
+MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
 MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1]
-MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 33], B2[132, 1]) -> [132, MBSize 33]
-MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 33], HLast[132, MBSize 33]) -> [1, 1]
+MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
 MPI Rank 0: 
+MPI Rank 0: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2.
 MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 62]
+MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 0: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 0: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 62]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+MPI Rank 0: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
+MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
+MPI Rank 0: 
+MPI Rank 0: Validating for node EvalErrorPrediction, final verification.
+MPI Rank 0: 
+MPI Rank 0: Validating --> labels = InputValue -> [132, MBSize 62]
+MPI Rank 0: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 0: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 0: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 0: Validating --> features = InputValue -> [363, MBSize 62]
+MPI Rank 0: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 0: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 0: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+MPI Rank 0: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 0: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 0: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 0: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
+MPI Rank 0: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 0: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+MPI Rank 0: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
 MPI Rank 0: 
 MPI Rank 0: 9 out of 20 nodes do not share the minibatch layout with the input data.
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[   1-  10 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.45646170; EvalErr[0]PerSample = 0.92500000; TotalTime = 0.49931s; TotalTimePerSample = 0.78017ms; SamplesPerSecond = 1281
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  11-  20 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.22315661; EvalErr[0]PerSample = 0.90156250; TotalTime = 0.41658s; TotalTimePerSample = 0.65091ms; SamplesPerSecond = 1536
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  21-  30 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.95180607; EvalErr[0]PerSample = 0.84687500; TotalTime = 0.39417s; TotalTimePerSample = 0.61589ms; SamplesPerSecond = 1623
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  31-  40 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.94158019; EvalErr[0]PerSample = 0.89843750; TotalTime = 0.40679s; TotalTimePerSample = 0.63561ms; SamplesPerSecond = 1573
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  41-  50 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.85668726; EvalErr[0]PerSample = 0.91093750; TotalTime = 0.38408s; TotalTimePerSample = 0.60013ms; SamplesPerSecond = 1666
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  51-  60 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.72866371; EvalErr[0]PerSample = 0.89531250; TotalTime = 0.38362s; TotalTimePerSample = 0.59940ms; SamplesPerSecond = 1668
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  61-  70 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.51808934; EvalErr[0]PerSample = 0.82968750; TotalTime = 0.37649s; TotalTimePerSample = 0.58826ms; SamplesPerSecond = 1699
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  71-  80 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.48455124; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.37230s; TotalTimePerSample = 0.58172ms; SamplesPerSecond = 1719
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  81-  90 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.33829281; EvalErr[0]PerSample = 0.76875000; TotalTime = 0.38200s; TotalTimePerSample = 0.59688ms; SamplesPerSecond = 1675
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.50167446; EvalErr[0]PerSample = 0.79843750; TotalTime = 0.37789s; TotalTimePerSample = 0.59045ms; SamplesPerSecond = 1693
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.22861682; EvalErr[0]PerSample = 0.80000000; TotalTime = 0.37564s; TotalTimePerSample = 0.58693ms; SamplesPerSecond = 1703
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.32616995; EvalErr[0]PerSample = 0.79062500; TotalTime = 0.37496s; TotalTimePerSample = 0.58588ms; SamplesPerSecond = 1706
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.16897953; EvalErr[0]PerSample = 0.77968750; TotalTime = 0.37711s; TotalTimePerSample = 0.58923ms; SamplesPerSecond = 1697
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.08892002; EvalErr[0]PerSample = 0.77656250; TotalTime = 0.38238s; TotalTimePerSample = 0.59747ms; SamplesPerSecond = 1673
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.06004848; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.37484s; TotalTimePerSample = 0.58569ms; SamplesPerSecond = 1707
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.91128321; EvalErr[0]PerSample = 0.69531250; TotalTime = 0.37006s; TotalTimePerSample = 0.57822ms; SamplesPerSecond = 1729
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.90171920; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.37733s; TotalTimePerSample = 0.58958ms; SamplesPerSecond = 1696
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.73262413; EvalErr[0]PerSample = 0.65312500; TotalTime = 0.37413s; TotalTimePerSample = 0.58458ms; SamplesPerSecond = 1710
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.66515363; EvalErr[0]PerSample = 0.68437500; TotalTime = 0.38770s; TotalTimePerSample = 0.60578ms; SamplesPerSecond = 1650
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.67382489; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.37570s; TotalTimePerSample = 0.58703ms; SamplesPerSecond = 1703
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.52869718; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.37952s; TotalTimePerSample = 0.59299ms; SamplesPerSecond = 1686
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.60031970; EvalErr[0]PerSample = 0.66718750; TotalTime = 0.37927s; TotalTimePerSample = 0.59261ms; SamplesPerSecond = 1687
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.51134087; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.37869s; TotalTimePerSample = 0.59171ms; SamplesPerSecond = 1690
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.45362164; EvalErr[0]PerSample = 0.63750000; TotalTime = 0.37509s; TotalTimePerSample = 0.58608ms; SamplesPerSecond = 1706
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.41640677; EvalErr[0]PerSample = 0.61562500; TotalTime = 0.36971s; TotalTimePerSample = 0.57768ms; SamplesPerSecond = 1731
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.39745369; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.37111s; TotalTimePerSample = 0.57986ms; SamplesPerSecond = 1724
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.16416032; EvalErr[0]PerSample = 0.56718750; TotalTime = 0.37536s; TotalTimePerSample = 0.58650ms; SamplesPerSecond = 1705
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.30346910; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.37963s; TotalTimePerSample = 0.59317ms; SamplesPerSecond = 1685
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.24398823; EvalErr[0]PerSample = 0.60937500; TotalTime = 0.37262s; TotalTimePerSample = 0.58221ms; SamplesPerSecond = 1717
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.15322470; EvalErr[0]PerSample = 0.57968750; TotalTime = 0.37330s; TotalTimePerSample = 0.58328ms; SamplesPerSecond = 1714
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.21664598; EvalErr[0]PerSample = 0.59531250; TotalTime = 0.38548s; TotalTimePerSample = 0.60232ms; SamplesPerSecond = 1660
-MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.25246635; EvalErr[0]PerSample = 0.60156250; TotalTime = 0.37795s; TotalTimePerSample = 0.59054ms; SamplesPerSecond = 1693
-MPI Rank 0: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.0000031; EvalErrPerSample = 0.72836914; Ave LearnRatePerSample = 0.015625; EpochTime=12.320558
-MPI Rank 0: Starting Epoch 2: learning rate per sample = 0.001953  momentum = 0.656119 
+MPI Rank 0: 
+MPI Rank 0: 
+MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[   1-  10 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.45645981; EvalErr[0]PerSample = 0.92500000; TotalTime = 0.33332s; TotalTimePerSample = 0.52081ms; SamplesPerSecond = 1920
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  11-  20 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.22315785; EvalErr[0]PerSample = 0.90156250; TotalTime = 0.28125s; TotalTimePerSample = 0.43946ms; SamplesPerSecond = 2275
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  21-  30 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.95180676; EvalErr[0]PerSample = 0.84687500; TotalTime = 0.25998s; TotalTimePerSample = 0.40622ms; SamplesPerSecond = 2461
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  31-  40 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.94158071; EvalErr[0]PerSample = 0.89843750; TotalTime = 0.28931s; TotalTimePerSample = 0.45204ms; SamplesPerSecond = 2212
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  41-  50 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.85668763; EvalErr[0]PerSample = 0.91093750; TotalTime = 0.26162s; TotalTimePerSample = 0.40878ms; SamplesPerSecond = 2446
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  51-  60 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.72866399; EvalErr[0]PerSample = 0.89531250; TotalTime = 0.24786s; TotalTimePerSample = 0.38728ms; SamplesPerSecond = 2582
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  61-  70 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.51808951; EvalErr[0]PerSample = 0.82968750; TotalTime = 0.26001s; TotalTimePerSample = 0.40627ms; SamplesPerSecond = 2461
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  71-  80 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.48455147; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.25240s; TotalTimePerSample = 0.39438ms; SamplesPerSecond = 2535
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  81-  90 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.33829288; EvalErr[0]PerSample = 0.76875000; TotalTime = 0.23911s; TotalTimePerSample = 0.37360ms; SamplesPerSecond = 2676
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[  91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.50167490; EvalErr[0]PerSample = 0.79843750; TotalTime = 0.24356s; TotalTimePerSample = 0.38057ms; SamplesPerSecond = 2627
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.22861768; EvalErr[0]PerSample = 0.80000000; TotalTime = 0.23348s; TotalTimePerSample = 0.36482ms; SamplesPerSecond = 2741
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.32617094; EvalErr[0]PerSample = 0.79062500; TotalTime = 0.23074s; TotalTimePerSample = 0.36054ms; SamplesPerSecond = 2773
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.16898033; EvalErr[0]PerSample = 0.77968750; TotalTime = 0.22036s; TotalTimePerSample = 0.34431ms; SamplesPerSecond = 2904
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.08892100; EvalErr[0]PerSample = 0.77656250; TotalTime = 0.21850s; TotalTimePerSample = 0.34140ms; SamplesPerSecond = 2929
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.06004828; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.24011s; TotalTimePerSample = 0.37518ms; SamplesPerSecond = 2665
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.91128317; EvalErr[0]PerSample = 0.69531250; TotalTime = 0.32731s; TotalTimePerSample = 0.51141ms; SamplesPerSecond = 1955
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.90171901; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.32067s; TotalTimePerSample = 0.50105ms; SamplesPerSecond = 1995
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.73262447; EvalErr[0]PerSample = 0.65312500; TotalTime = 0.33546s; TotalTimePerSample = 0.52416ms; SamplesPerSecond = 1907
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.66515410; EvalErr[0]PerSample = 0.68437500; TotalTime = 0.35016s; TotalTimePerSample = 0.54712ms; SamplesPerSecond = 1827
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.67382540; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.33016s; TotalTimePerSample = 0.51587ms; SamplesPerSecond = 1938
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.52869780; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.35647s; TotalTimePerSample = 0.55699ms; SamplesPerSecond = 1795
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.60032086; EvalErr[0]PerSample = 0.66718750; TotalTime = 0.36474s; TotalTimePerSample = 0.56991ms; SamplesPerSecond = 1754
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.51134188; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.33930s; TotalTimePerSample = 0.53015ms; SamplesPerSecond = 1886
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.45362252; EvalErr[0]PerSample = 0.63750000; TotalTime = 0.34390s; TotalTimePerSample = 0.53735ms; SamplesPerSecond = 1860
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.41640740; EvalErr[0]PerSample = 0.61562500; TotalTime = 0.33520s; TotalTimePerSample = 0.52375ms; SamplesPerSecond = 1909
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.39745478; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.34931s; TotalTimePerSample = 0.54580ms; SamplesPerSecond = 1832
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.16416053; EvalErr[0]PerSample = 0.56718750; TotalTime = 0.36688s; TotalTimePerSample = 0.57324ms; SamplesPerSecond = 1744
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.30346869; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.37710s; TotalTimePerSample = 0.58922ms; SamplesPerSecond = 1697
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.24398831; EvalErr[0]PerSample = 0.60937500; TotalTime = 0.36403s; TotalTimePerSample = 0.56879ms; SamplesPerSecond = 1758
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.15322487; EvalErr[0]PerSample = 0.57968750; TotalTime = 0.34077s; TotalTimePerSample = 0.53246ms; SamplesPerSecond = 1878
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.21664627; EvalErr[0]PerSample = 0.59531250; TotalTime = 0.35019s; TotalTimePerSample = 0.54718ms; SamplesPerSecond = 1827
+MPI Rank 0:  Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.25246685; EvalErr[0]PerSample = 0.60156250; TotalTime = 0.32892s; TotalTimePerSample = 0.51394ms; SamplesPerSecond = 1945
+MPI Rank 0: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.0000035; EvalErrPerSample = 0.72836914; Ave LearnRatePerSample = 0.015625; EpochTime=9.770017
+MPI Rank 0: Starting Epoch 2: learning rate per sample = 0.001953  effective momentum = 0.656119 
 MPI Rank 0: minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 0 of 3, with 1 datapasses
 MPI Rank 0: 
 MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
-MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[   1-  10 of 80]: SamplesSeen = 2560; TrainLossPerSample =  2.08151923; EvalErr[0]PerSample = 0.55859375; TotalTime = 0.49257s; TotalTimePerSample = 0.19241ms; SamplesPerSecond = 5197
-MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  11-  20 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98395650; EvalErr[0]PerSample = 0.54257813; TotalTime = 0.45941s; TotalTimePerSample = 0.17946ms; SamplesPerSecond = 5572
-MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  21-  30 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98575441; EvalErr[0]PerSample = 0.54492188; TotalTime = 0.47006s; TotalTimePerSample = 0.18362ms; SamplesPerSecond = 5446
-MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  31-  40 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.90485007; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.47298s; TotalTimePerSample = 0.18476ms; SamplesPerSecond = 5412
-MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  41-  50 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.88324108; EvalErr[0]PerSample = 0.52539063; TotalTime = 0.48624s; TotalTimePerSample = 0.18994ms; SamplesPerSecond = 5264
-MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  51-  60 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.89109287; EvalErr[0]PerSample = 0.53359375; TotalTime = 0.48020s; TotalTimePerSample = 0.18758ms; SamplesPerSecond = 5331
-MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  61-  70 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.89496218; EvalErr[0]PerSample = 0.52890625; TotalTime = 0.45121s; TotalTimePerSample = 0.17625ms; SamplesPerSecond = 5673
-MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  71-  80 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.85944253; EvalErr[0]PerSample = 0.52265625; TotalTime = 0.43800s; TotalTimePerSample = 0.17110ms; SamplesPerSecond = 5844
-MPI Rank 0: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9356024; EvalErrPerSample = 0.53603516; Ave LearnRatePerSample = 0.001953125; EpochTime=3.769975
-MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.000098  momentum = 0.656119 
+MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[   1-  10 of 80]: SamplesSeen = 2560; TrainLossPerSample =  2.08151948; EvalErr[0]PerSample = 0.55859375; TotalTime = 0.56668s; TotalTimePerSample = 0.22136ms; SamplesPerSecond = 4517
+MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  11-  20 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98395688; EvalErr[0]PerSample = 0.54257813; TotalTime = 0.46958s; TotalTimePerSample = 0.18343ms; SamplesPerSecond = 5451
+MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  21-  30 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98575479; EvalErr[0]PerSample = 0.54492188; TotalTime = 0.41833s; TotalTimePerSample = 0.16341ms; SamplesPerSecond = 6119
+MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  31-  40 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.90485039; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.33510s; TotalTimePerSample = 0.13090ms; SamplesPerSecond = 7639
+MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  41-  50 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.88324146; EvalErr[0]PerSample = 0.52539063; TotalTime = 0.31206s; TotalTimePerSample = 0.12190ms; SamplesPerSecond = 8203
+MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  51-  60 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.89109327; EvalErr[0]PerSample = 0.53359375; TotalTime = 0.29685s; TotalTimePerSample = 0.11596ms; SamplesPerSecond = 8623
+MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  61-  70 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.89496253; EvalErr[0]PerSample = 0.52890625; TotalTime = 0.32672s; TotalTimePerSample = 0.12762ms; SamplesPerSecond = 7835
+MPI Rank 0:  Epoch[ 2 of 3]-Minibatch[  71-  80 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.85944295; EvalErr[0]PerSample = 0.52265625; TotalTime = 0.34130s; TotalTimePerSample = 0.13332ms; SamplesPerSecond = 7500
+MPI Rank 0: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9356027; EvalErrPerSample = 0.53603516; Ave LearnRatePerSample = 0.001953125; EpochTime=3.100422
+MPI Rank 0: Starting Epoch 3: learning rate per sample = 0.000098  effective momentum = 0.656119 
 MPI Rank 0: minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 0 of 3, with 1 datapasses
 MPI Rank 0: 
 MPI Rank 0: Starting minibatch loop, DataParallelSGD training (MyRank = 0, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
-MPI Rank 0:  Epoch[ 3 of 3]-Minibatch[   1-  10 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.86752815; EvalErr[0]PerSample = 0.52177734; TotalTime = 0.95049s; TotalTimePerSample = 0.09282ms; SamplesPerSecond = 10773
-MPI Rank 0:  Epoch[ 3 of 3]-Minibatch[  11-  20 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.87358797; EvalErr[0]PerSample = 0.51542969; TotalTime = 0.79398s; TotalTimePerSample = 0.07754ms; SamplesPerSecond = 12897
-MPI Rank 0: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8705581; EvalErrPerSample = 0.51860352; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=1.792685
+MPI Rank 0:  Epoch[ 3 of 3]-Minibatch[   1-  10 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.86752856; EvalErr[0]PerSample = 0.52177734; TotalTime = 1.01461s; TotalTimePerSample = 0.09908ms; SamplesPerSecond = 10092
+MPI Rank 0:  Epoch[ 3 of 3]-Minibatch[  11-  20 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.87358831; EvalErr[0]PerSample = 0.51542969; TotalTime = 0.82689s; TotalTimePerSample = 0.08075ms; SamplesPerSecond = 12383
+MPI Rank 0: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8705584; EvalErrPerSample = 0.51860352; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=1.909274
 MPI Rank 0: CNTKCommandTrainEnd: speechTrain
 MPI Rank 0: COMPLETED
 MPI Rank 0: ~MPIWrapper
-MPI Rank 1: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank1
+MPI Rank 1: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank1
 MPI Rank 1: -------------------------------------------------------------------
 MPI Rank 1: Build info: 
 MPI Rank 1: 
-MPI Rank 1: 		Built time: Oct  2 2015 13:14:34
-MPI Rank 1: 		Last modified date: Fri Oct  2 13:09:06 2015
+MPI Rank 1: 		Built time: Oct 24 2015 13:33:25
+MPI Rank 1: 		Last modified date: Thu Oct 22 16:00:27 2015
 MPI Rank 1: 		Built by amitaga on Amitaga-Win-DT3           
-MPI Rank 1: 		Build Path: E:\NetScale\CNTK\git_repos\cplx_master\MachineLearning\CNTK\
+MPI Rank 1: 		Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\
 MPI Rank 1: 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
 MPI Rank 1: -------------------------------------------------------------------
-MPI Rank 1: running on Amitaga-Win-DT3 at 2015/10/02 21:20:29
-MPI Rank 1: command line options: 
-MPI Rank 1: configFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\DNN\cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr 
+MPI Rank 1: running on Amitaga-Win-DT3 at 2015/10/24 22:14:12
+MPI Rank 1: command line: 
+MPI Rank 1: E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN/cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr 
 MPI Rank 1: 
 MPI Rank 1: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
 MPI Rank 1: precision=float
@@ -607,10 +718,11 @@ MPI Rank 1:           labelType=Category
 MPI Rank 1:       ]
 MPI Rank 1:     ]
 MPI Rank 1: ]
-MPI Rank 1: RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu
-MPI Rank 1: DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data
+MPI Rank 1: RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu
+MPI Rank 1: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
+MPI Rank 1: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN
 MPI Rank 1: DeviceId=0
-MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr
+MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr
 MPI Rank 1: 
 MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
 MPI Rank 1: 
@@ -621,7 +733,7 @@ MPI Rank 1: deviceId=0
 MPI Rank 1: parallelTrain=true
 MPI Rank 1: speechTrain=[
 MPI Rank 1:     action=train
-MPI Rank 1:     modelPath=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
+MPI Rank 1:     modelPath=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
 MPI Rank 1:     deviceId=0
 MPI Rank 1:     traceLevel=1
 MPI Rank 1:     SimpleNetworkBuilder=[
@@ -697,30 +809,32 @@ MPI Rank 1:           type=Real
 MPI Rank 1:           scpFile=glob_0000.scp
 MPI Rank 1:       ]
 MPI Rank 1:       labels=[
-MPI Rank 1:           mlfFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf
-MPI Rank 1:           labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list
+MPI Rank 1:           mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf
+MPI Rank 1:           labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list
 MPI Rank 1:           labelDim=132
 MPI Rank 1:           labelType=Category
 MPI Rank 1:       ]
 MPI Rank 1:     ]
 MPI Rank 1: ]
-MPI Rank 1: RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu
-MPI Rank 1: DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data
+MPI Rank 1: RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu
+MPI Rank 1: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
+MPI Rank 1: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN
 MPI Rank 1: DeviceId=0
-MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr
+MPI Rank 1: stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr
 MPI Rank 1: 
 MPI Rank 1: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 MPI Rank 1: 
 MPI Rank 1: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 MPI Rank 1: configparameters: cntk.config:command=speechTrain
-MPI Rank 1: configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data
+MPI Rank 1: configparameters: cntk.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN
+MPI Rank 1: configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
 MPI Rank 1: configparameters: cntk.config:deviceId=0
 MPI Rank 1: configparameters: cntk.config:parallelTrain=true
 MPI Rank 1: configparameters: cntk.config:precision=float
-MPI Rank 1: configparameters: cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu
+MPI Rank 1: configparameters: cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu
 MPI Rank 1: configparameters: cntk.config:speechTrain=[
 MPI Rank 1:     action=train
-MPI Rank 1:     modelPath=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
+MPI Rank 1:     modelPath=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
 MPI Rank 1:     deviceId=0
 MPI Rank 1:     traceLevel=1
 MPI Rank 1:     SimpleNetworkBuilder=[
@@ -796,34 +910,36 @@ MPI Rank 1:           type=Real
 MPI Rank 1:           scpFile=glob_0000.scp
 MPI Rank 1:       ]
 MPI Rank 1:       labels=[
-MPI Rank 1:           mlfFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf
-MPI Rank 1:           labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list
+MPI Rank 1:           mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf
+MPI Rank 1:           labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list
 MPI Rank 1:           labelDim=132
 MPI Rank 1:           labelType=Category
 MPI Rank 1:       ]
 MPI Rank 1:     ]
 MPI Rank 1: ]
 MPI Rank 1: 
-MPI Rank 1: configparameters: cntk.config:stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr
+MPI Rank 1: configparameters: cntk.config:stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr
 MPI Rank 1: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 MPI Rank 1: command: speechTrain 
 MPI Rank 1: precision = float
+MPI Rank 1: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
 MPI Rank 1: CNTKCommandTrainInfo: speechTrain : 3
 MPI Rank 1: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3
 MPI Rank 1: CNTKCommandTrainBegin: speechTrain
 MPI Rank 1: SimpleNetworkBuilder Using GPU 0
 MPI Rank 1: reading script file glob_0000.scp ... 948 entries
 MPI Rank 1: trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion
-MPI Rank 1: total 132 state names in state list E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list
-MPI Rank 1: htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf ... total 948 entries
+MPI Rank 1: total 132 state names in state list E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list
+MPI Rank 1: htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf ... total 948 entries
 MPI Rank 1: ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
 MPI Rank 1: label set 0: 129 classes
 MPI Rank 1: minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
+MPI Rank 1: SetUniformRandomValue (GPU): creating curand object with seed 1
 MPI Rank 1: GetTrainCriterionNodes  ...
 MPI Rank 1: GetEvalCriterionNodes  ...
 MPI Rank 1: 
 MPI Rank 1: 
-MPI Rank 1: Validating node CrossEntropyWithSoftmax 
+MPI Rank 1: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1.
 MPI Rank 1: 
 MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3]
 MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512]
@@ -846,13 +962,57 @@ MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1]
 MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
 MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
 MPI Rank 1: 
+MPI Rank 1: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2.
 MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 1: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 1: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3]
+MPI Rank 1: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3]
+MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
+MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
+MPI Rank 1: 
+MPI Rank 1: Validating for node CrossEntropyWithSoftmax, final verification.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 1: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 1: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3]
+MPI Rank 1: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3]
+MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
+MPI Rank 1: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
 MPI Rank 1: 
 MPI Rank 1: 9 out of 20 nodes do not share the minibatch layout with the input data.
-MPI Rank 1: Found 6 PreCompute nodes
-MPI Rank 1: 	NodeName: InvStdOfFeatures
-MPI Rank 1: 	NodeName: MeanOfFeatures
-MPI Rank 1: 	NodeName: Prior
+MPI Rank 1: 
+MPI Rank 1: 
+MPI Rank 1: Precomputing --> 3 PreCompute nodes found.
+MPI Rank 1: 
 MPI Rank 1: 	NodeName: InvStdOfFeatures
 MPI Rank 1: 	NodeName: MeanOfFeatures
 MPI Rank 1: 	NodeName: Prior
@@ -860,136 +1020,201 @@ MPI Rank 1: minibatchiterator: epoch 0: frames [0..252734] (first utterance at f
 MPI Rank 1: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
 MPI Rank 1: 
 MPI Rank 1: 
-MPI Rank 1: Validating node InvStdOfFeatures 
+MPI Rank 1: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1.
 MPI Rank 1: 
-MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 64]
-MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 64]) -> [363, 1]
+MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 1: 
+MPI Rank 1: Validating for node InvStdOfFeatures, final verification.
 MPI Rank 1: 
+MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 1: 
 MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data.
 MPI Rank 1: 
 MPI Rank 1: 
-MPI Rank 1: Validating node MeanOfFeatures 
 MPI Rank 1: 
-MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 64]
-MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 64]) -> [363, 1]
+MPI Rank 1: Validating for node MeanOfFeatures. 2 nodes to process in pass 1.
 MPI Rank 1: 
+MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 1: 
+MPI Rank 1: Validating for node MeanOfFeatures, final verification.
+MPI Rank 1: 
+MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 1: 
 MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data.
 MPI Rank 1: 
 MPI Rank 1: 
-MPI Rank 1: Validating node Prior 
 MPI Rank 1: 
-MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 64]
-MPI Rank 1: Validating --> Prior = Mean(labels[132, MBSize 64]) -> [132, 1]
+MPI Rank 1: Validating for node Prior. 2 nodes to process in pass 1.
 MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 1: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
 MPI Rank 1: 
+MPI Rank 1: Validating for node Prior. 1 nodes to process in pass 2.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 1: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
+MPI Rank 1: 
+MPI Rank 1: Validating for node Prior, final verification.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 1: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
 MPI Rank 1: 
 MPI Rank 1: 1 out of 2 nodes do not share the minibatch layout with the input data.
+MPI Rank 1: 
+MPI Rank 1: 
+MPI Rank 1: Precomputing --> Completed.
+MPI Rank 1: 
 MPI Rank 1: Set Max Temp Mem Size For Convolution Nodes to 0 samples.
-MPI Rank 1: Starting Epoch 1: learning rate per sample = 0.015625  momentum = 0.900000 
+MPI Rank 1: Starting Epoch 1: learning rate per sample = 0.015625  effective momentum = 0.900000 
 MPI Rank 1: minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 1 of 3, with 1 datapasses
 MPI Rank 1: 
-MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
 MPI Rank 1: 
+MPI Rank 1: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1.
 MPI Rank 1: 
-MPI Rank 1: Validating node EvalErrorPrediction 
-MPI Rank 1: 
-MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 15]
+MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 62]
 MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512]
 MPI Rank 1: Validating --> W1 = LearnableParameter -> [512, 512]
 MPI Rank 1: Validating --> W0 = LearnableParameter -> [512, 363]
-MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 15]
-MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 15]) -> [363, 1]
-MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 15]) -> [363, 1]
-MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 15], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 15]
-MPI Rank 1: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 15]) -> [512, MBSize 15]
+MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 62]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+MPI Rank 1: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
 MPI Rank 1: Validating --> B0 = LearnableParameter -> [512, 1]
-MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 15], B0[512, 1]) -> [512, MBSize 15]
-MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 15]) -> [512, MBSize 15]
-MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 15]) -> [512, MBSize 15]
+MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
 MPI Rank 1: Validating --> B1 = LearnableParameter -> [512, 1]
-MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 15], B1[512, 1]) -> [512, MBSize 15]
-MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 15]) -> [512, MBSize 15]
-MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 15]) -> [132, MBSize 15]
+MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
 MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1]
-MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 15], B2[132, 1]) -> [132, MBSize 15]
-MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 15], HLast[132, MBSize 15]) -> [1, 1]
+MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
 MPI Rank 1: 
+MPI Rank 1: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2.
 MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 62]
+MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 1: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 1: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 62]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+MPI Rank 1: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
+MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
+MPI Rank 1: 
+MPI Rank 1: Validating for node EvalErrorPrediction, final verification.
+MPI Rank 1: 
+MPI Rank 1: Validating --> labels = InputValue -> [132, MBSize 62]
+MPI Rank 1: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 1: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 1: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 1: Validating --> features = InputValue -> [363, MBSize 62]
+MPI Rank 1: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 1: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 1: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+MPI Rank 1: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 1: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 1: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 1: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
+MPI Rank 1: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 1: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+MPI Rank 1: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
 MPI Rank 1: 
 MPI Rank 1: 9 out of 20 nodes do not share the minibatch layout with the input data.
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[   1-  10 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.45646170; EvalErr[0]PerSample = 0.92500000; TotalTime = 0.50010s; TotalTimePerSample = 0.78140ms; SamplesPerSecond = 1279
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  11-  20 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.22315661; EvalErr[0]PerSample = 0.90156250; TotalTime = 0.41657s; TotalTimePerSample = 0.65089ms; SamplesPerSecond = 1536
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  21-  30 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.95180607; EvalErr[0]PerSample = 0.84687500; TotalTime = 0.39416s; TotalTimePerSample = 0.61588ms; SamplesPerSecond = 1623
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  31-  40 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.94158019; EvalErr[0]PerSample = 0.89843750; TotalTime = 0.40678s; TotalTimePerSample = 0.63559ms; SamplesPerSecond = 1573
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  41-  50 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.85668726; EvalErr[0]PerSample = 0.91093750; TotalTime = 0.38408s; TotalTimePerSample = 0.60012ms; SamplesPerSecond = 1666
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  51-  60 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.72866371; EvalErr[0]PerSample = 0.89531250; TotalTime = 0.38362s; TotalTimePerSample = 0.59941ms; SamplesPerSecond = 1668
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  61-  70 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.51808934; EvalErr[0]PerSample = 0.82968750; TotalTime = 0.37649s; TotalTimePerSample = 0.58827ms; SamplesPerSecond = 1699
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  71-  80 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.48455124; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.37226s; TotalTimePerSample = 0.58166ms; SamplesPerSecond = 1719
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  81-  90 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.33829281; EvalErr[0]PerSample = 0.76875000; TotalTime = 0.38200s; TotalTimePerSample = 0.59687ms; SamplesPerSecond = 1675
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.50167446; EvalErr[0]PerSample = 0.79843750; TotalTime = 0.37789s; TotalTimePerSample = 0.59045ms; SamplesPerSecond = 1693
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.22861682; EvalErr[0]PerSample = 0.80000000; TotalTime = 0.37564s; TotalTimePerSample = 0.58693ms; SamplesPerSecond = 1703
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.32616995; EvalErr[0]PerSample = 0.79062500; TotalTime = 0.37496s; TotalTimePerSample = 0.58588ms; SamplesPerSecond = 1706
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.16897953; EvalErr[0]PerSample = 0.77968750; TotalTime = 0.37711s; TotalTimePerSample = 0.58923ms; SamplesPerSecond = 1697
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.08892002; EvalErr[0]PerSample = 0.77656250; TotalTime = 0.38239s; TotalTimePerSample = 0.59748ms; SamplesPerSecond = 1673
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.06004848; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.37484s; TotalTimePerSample = 0.58569ms; SamplesPerSecond = 1707
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.91128321; EvalErr[0]PerSample = 0.69531250; TotalTime = 0.37009s; TotalTimePerSample = 0.57826ms; SamplesPerSecond = 1729
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.90171920; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.37731s; TotalTimePerSample = 0.58955ms; SamplesPerSecond = 1696
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.73262413; EvalErr[0]PerSample = 0.65312500; TotalTime = 0.37414s; TotalTimePerSample = 0.58459ms; SamplesPerSecond = 1710
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.66515363; EvalErr[0]PerSample = 0.68437500; TotalTime = 0.38771s; TotalTimePerSample = 0.60580ms; SamplesPerSecond = 1650
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.67382489; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.37570s; TotalTimePerSample = 0.58703ms; SamplesPerSecond = 1703
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.52869718; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.37950s; TotalTimePerSample = 0.59297ms; SamplesPerSecond = 1686
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.60031970; EvalErr[0]PerSample = 0.66718750; TotalTime = 0.37925s; TotalTimePerSample = 0.59258ms; SamplesPerSecond = 1687
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.51134087; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.37868s; TotalTimePerSample = 0.59169ms; SamplesPerSecond = 1690
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.45362164; EvalErr[0]PerSample = 0.63750000; TotalTime = 0.37507s; TotalTimePerSample = 0.58605ms; SamplesPerSecond = 1706
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.41640677; EvalErr[0]PerSample = 0.61562500; TotalTime = 0.36971s; TotalTimePerSample = 0.57767ms; SamplesPerSecond = 1731
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.39745369; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.37111s; TotalTimePerSample = 0.57986ms; SamplesPerSecond = 1724
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.16416032; EvalErr[0]PerSample = 0.56718750; TotalTime = 0.37534s; TotalTimePerSample = 0.58647ms; SamplesPerSecond = 1705
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.30346910; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.37964s; TotalTimePerSample = 0.59319ms; SamplesPerSecond = 1685
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.24398823; EvalErr[0]PerSample = 0.60937500; TotalTime = 0.37259s; TotalTimePerSample = 0.58217ms; SamplesPerSecond = 1717
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.15322470; EvalErr[0]PerSample = 0.57968750; TotalTime = 0.37329s; TotalTimePerSample = 0.58327ms; SamplesPerSecond = 1714
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.21664598; EvalErr[0]PerSample = 0.59531250; TotalTime = 0.38547s; TotalTimePerSample = 0.60230ms; SamplesPerSecond = 1660
-MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.25246635; EvalErr[0]PerSample = 0.60156250; TotalTime = 0.37794s; TotalTimePerSample = 0.59053ms; SamplesPerSecond = 1693
-MPI Rank 1: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.0000031; EvalErrPerSample = 0.72836914; Ave LearnRatePerSample = 0.015625; EpochTime=12.320764
-MPI Rank 1: Starting Epoch 2: learning rate per sample = 0.001953  momentum = 0.656119 
+MPI Rank 1: 
+MPI Rank 1: 
+MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[   1-  10 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.45645981; EvalErr[0]PerSample = 0.92500000; TotalTime = 0.33798s; TotalTimePerSample = 0.52810ms; SamplesPerSecond = 1893
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  11-  20 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.22315785; EvalErr[0]PerSample = 0.90156250; TotalTime = 0.28013s; TotalTimePerSample = 0.43770ms; SamplesPerSecond = 2284
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  21-  30 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.95180676; EvalErr[0]PerSample = 0.84687500; TotalTime = 0.25668s; TotalTimePerSample = 0.40105ms; SamplesPerSecond = 2493
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  31-  40 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.94158071; EvalErr[0]PerSample = 0.89843750; TotalTime = 0.29127s; TotalTimePerSample = 0.45511ms; SamplesPerSecond = 2197
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  41-  50 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.85668763; EvalErr[0]PerSample = 0.91093750; TotalTime = 0.26257s; TotalTimePerSample = 0.41026ms; SamplesPerSecond = 2437
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  51-  60 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.72866399; EvalErr[0]PerSample = 0.89531250; TotalTime = 0.24697s; TotalTimePerSample = 0.38590ms; SamplesPerSecond = 2591
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  61-  70 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.51808951; EvalErr[0]PerSample = 0.82968750; TotalTime = 0.25941s; TotalTimePerSample = 0.40532ms; SamplesPerSecond = 2467
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  71-  80 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.48455147; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.25278s; TotalTimePerSample = 0.39497ms; SamplesPerSecond = 2531
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  81-  90 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.33829288; EvalErr[0]PerSample = 0.76875000; TotalTime = 0.23921s; TotalTimePerSample = 0.37376ms; SamplesPerSecond = 2675
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[  91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.50167490; EvalErr[0]PerSample = 0.79843750; TotalTime = 0.24341s; TotalTimePerSample = 0.38032ms; SamplesPerSecond = 2629
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.22861768; EvalErr[0]PerSample = 0.80000000; TotalTime = 0.23534s; TotalTimePerSample = 0.36772ms; SamplesPerSecond = 2719
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.32617094; EvalErr[0]PerSample = 0.79062500; TotalTime = 0.22909s; TotalTimePerSample = 0.35795ms; SamplesPerSecond = 2793
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.16898033; EvalErr[0]PerSample = 0.77968750; TotalTime = 0.21901s; TotalTimePerSample = 0.34220ms; SamplesPerSecond = 2922
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.08892100; EvalErr[0]PerSample = 0.77656250; TotalTime = 0.21937s; TotalTimePerSample = 0.34277ms; SamplesPerSecond = 2917
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.06004828; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.24068s; TotalTimePerSample = 0.37606ms; SamplesPerSecond = 2659
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.91128317; EvalErr[0]PerSample = 0.69531250; TotalTime = 0.32338s; TotalTimePerSample = 0.50528ms; SamplesPerSecond = 1979
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.90171901; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.32313s; TotalTimePerSample = 0.50489ms; SamplesPerSecond = 1980
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.73262447; EvalErr[0]PerSample = 0.65312500; TotalTime = 0.33982s; TotalTimePerSample = 0.53097ms; SamplesPerSecond = 1883
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.66515410; EvalErr[0]PerSample = 0.68437500; TotalTime = 0.34907s; TotalTimePerSample = 0.54543ms; SamplesPerSecond = 1833
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.67382540; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.33119s; TotalTimePerSample = 0.51748ms; SamplesPerSecond = 1932
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.52869780; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.35522s; TotalTimePerSample = 0.55503ms; SamplesPerSecond = 1801
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.60032086; EvalErr[0]PerSample = 0.66718750; TotalTime = 0.36689s; TotalTimePerSample = 0.57327ms; SamplesPerSecond = 1744
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.51134188; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.33598s; TotalTimePerSample = 0.52497ms; SamplesPerSecond = 1904
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.45362252; EvalErr[0]PerSample = 0.63750000; TotalTime = 0.34488s; TotalTimePerSample = 0.53888ms; SamplesPerSecond = 1855
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.41640740; EvalErr[0]PerSample = 0.61562500; TotalTime = 0.33076s; TotalTimePerSample = 0.51681ms; SamplesPerSecond = 1934
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.39745478; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.35552s; TotalTimePerSample = 0.55550ms; SamplesPerSecond = 1800
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.16416053; EvalErr[0]PerSample = 0.56718750; TotalTime = 0.36347s; TotalTimePerSample = 0.56791ms; SamplesPerSecond = 1760
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.30346869; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.37626s; TotalTimePerSample = 0.58791ms; SamplesPerSecond = 1700
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.24398831; EvalErr[0]PerSample = 0.60937500; TotalTime = 0.36571s; TotalTimePerSample = 0.57142ms; SamplesPerSecond = 1750
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.15322487; EvalErr[0]PerSample = 0.57968750; TotalTime = 0.34686s; TotalTimePerSample = 0.54198ms; SamplesPerSecond = 1845
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.21664627; EvalErr[0]PerSample = 0.59531250; TotalTime = 0.34436s; TotalTimePerSample = 0.53806ms; SamplesPerSecond = 1858
+MPI Rank 1:  Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.25246685; EvalErr[0]PerSample = 0.60156250; TotalTime = 0.33067s; TotalTimePerSample = 0.51667ms; SamplesPerSecond = 1935
+MPI Rank 1: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.0000035; EvalErrPerSample = 0.72836914; Ave LearnRatePerSample = 0.015625; EpochTime=9.771287
+MPI Rank 1: Starting Epoch 2: learning rate per sample = 0.001953  effective momentum = 0.656119 
 MPI Rank 1: minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 1 of 3, with 1 datapasses
 MPI Rank 1: 
 MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
-MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[   1-  10 of 80]: SamplesSeen = 2560; TrainLossPerSample =  2.08151923; EvalErr[0]PerSample = 0.55859375; TotalTime = 0.49230s; TotalTimePerSample = 0.19230ms; SamplesPerSecond = 5200
-MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  11-  20 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98395650; EvalErr[0]PerSample = 0.54257813; TotalTime = 0.45941s; TotalTimePerSample = 0.17946ms; SamplesPerSecond = 5572
-MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  21-  30 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98575441; EvalErr[0]PerSample = 0.54492188; TotalTime = 0.47004s; TotalTimePerSample = 0.18361ms; SamplesPerSecond = 5446
-MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  31-  40 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.90485007; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.47297s; TotalTimePerSample = 0.18476ms; SamplesPerSecond = 5412
-MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  41-  50 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.88324108; EvalErr[0]PerSample = 0.52539063; TotalTime = 0.48623s; TotalTimePerSample = 0.18993ms; SamplesPerSecond = 5265
-MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  51-  60 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.89109287; EvalErr[0]PerSample = 0.53359375; TotalTime = 0.48019s; TotalTimePerSample = 0.18757ms; SamplesPerSecond = 5331
-MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  61-  70 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.89496218; EvalErr[0]PerSample = 0.52890625; TotalTime = 0.45120s; TotalTimePerSample = 0.17625ms; SamplesPerSecond = 5673
-MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  71-  80 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.85944253; EvalErr[0]PerSample = 0.52265625; TotalTime = 0.43800s; TotalTimePerSample = 0.17109ms; SamplesPerSecond = 5844
-MPI Rank 1: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9356024; EvalErrPerSample = 0.53603516; Ave LearnRatePerSample = 0.001953125; EpochTime=3.770171
-MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.000098  momentum = 0.656119 
+MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[   1-  10 of 80]: SamplesSeen = 2560; TrainLossPerSample =  2.08151948; EvalErr[0]PerSample = 0.55859375; TotalTime = 0.56765s; TotalTimePerSample = 0.22174ms; SamplesPerSecond = 4509
+MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  11-  20 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98395688; EvalErr[0]PerSample = 0.54257813; TotalTime = 0.46990s; TotalTimePerSample = 0.18356ms; SamplesPerSecond = 5447
+MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  21-  30 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98575479; EvalErr[0]PerSample = 0.54492188; TotalTime = 0.42066s; TotalTimePerSample = 0.16432ms; SamplesPerSecond = 6085
+MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  31-  40 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.90485039; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.33456s; TotalTimePerSample = 0.13069ms; SamplesPerSecond = 7651
+MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  41-  50 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.88324146; EvalErr[0]PerSample = 0.52539063; TotalTime = 0.31133s; TotalTimePerSample = 0.12161ms; SamplesPerSecond = 8222
+MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  51-  60 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.89109327; EvalErr[0]PerSample = 0.53359375; TotalTime = 0.29677s; TotalTimePerSample = 0.11593ms; SamplesPerSecond = 8626
+MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  61-  70 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.89496253; EvalErr[0]PerSample = 0.52890625; TotalTime = 0.32641s; TotalTimePerSample = 0.12750ms; SamplesPerSecond = 7842
+MPI Rank 1:  Epoch[ 2 of 3]-Minibatch[  71-  80 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.85944295; EvalErr[0]PerSample = 0.52265625; TotalTime = 0.34823s; TotalTimePerSample = 0.13603ms; SamplesPerSecond = 7351
+MPI Rank 1: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9356027; EvalErrPerSample = 0.53603516; Ave LearnRatePerSample = 0.001953125; EpochTime=3.105348
+MPI Rank 1: Starting Epoch 3: learning rate per sample = 0.000098  effective momentum = 0.656119 
 MPI Rank 1: minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 1 of 3, with 1 datapasses
 MPI Rank 1: 
 MPI Rank 1: Starting minibatch loop, DataParallelSGD training (MyRank = 1, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
-MPI Rank 1:  Epoch[ 3 of 3]-Minibatch[   1-  10 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.86752815; EvalErr[0]PerSample = 0.52177734; TotalTime = 0.95579s; TotalTimePerSample = 0.09334ms; SamplesPerSecond = 10713
-MPI Rank 1:  Epoch[ 3 of 3]-Minibatch[  11-  20 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.87358797; EvalErr[0]PerSample = 0.51542969; TotalTime = 0.79393s; TotalTimePerSample = 0.07753ms; SamplesPerSecond = 12897
-MPI Rank 1: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8705581; EvalErrPerSample = 0.51860352; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=1.792467
+MPI Rank 1:  Epoch[ 3 of 3]-Minibatch[   1-  10 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.86752856; EvalErr[0]PerSample = 0.52177734; TotalTime = 1.01546s; TotalTimePerSample = 0.09917ms; SamplesPerSecond = 10084
+MPI Rank 1:  Epoch[ 3 of 3]-Minibatch[  11-  20 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.87358831; EvalErr[0]PerSample = 0.51542969; TotalTime = 0.82899s; TotalTimePerSample = 0.08096ms; SamplesPerSecond = 12352
+MPI Rank 1: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8705584; EvalErrPerSample = 0.51860352; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=1.90997
 MPI Rank 1: CNTKCommandTrainEnd: speechTrain
 MPI Rank 1: COMPLETED
 MPI Rank 1: ~MPIWrapper
-MPI Rank 2: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank2
+MPI Rank 2: Redirecting stderr to file C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr_speechTrain.logrank2
 MPI Rank 2: -------------------------------------------------------------------
 MPI Rank 2: Build info: 
 MPI Rank 2: 
-MPI Rank 2: 		Built time: Oct  2 2015 13:14:34
-MPI Rank 2: 		Last modified date: Fri Oct  2 13:09:06 2015
+MPI Rank 2: 		Built time: Oct 24 2015 13:33:25
+MPI Rank 2: 		Last modified date: Thu Oct 22 16:00:27 2015
 MPI Rank 2: 		Built by amitaga on Amitaga-Win-DT3           
-MPI Rank 2: 		Build Path: E:\NetScale\CNTK\git_repos\cplx_master\MachineLearning\CNTK\
+MPI Rank 2: 		Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\
 MPI Rank 2: 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
 MPI Rank 2: -------------------------------------------------------------------
-MPI Rank 2: running on Amitaga-Win-DT3 at 2015/10/02 21:20:30
-MPI Rank 2: command line options: 
-MPI Rank 2: configFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\DNN\cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr 
+MPI Rank 2: running on Amitaga-Win-DT3 at 2015/10/24 22:14:13
+MPI Rank 2: command line: 
+MPI Rank 2: E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN/cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN DeviceId=0 stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr 
 MPI Rank 2: 
 MPI Rank 2: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
 MPI Rank 2: precision=float
@@ -1081,10 +1306,11 @@ MPI Rank 2:           labelType=Category
 MPI Rank 2:       ]
 MPI Rank 2:     ]
 MPI Rank 2: ]
-MPI Rank 2: RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu
-MPI Rank 2: DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data
+MPI Rank 2: RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu
+MPI Rank 2: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
+MPI Rank 2: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN
 MPI Rank 2: DeviceId=0
-MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr
+MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr
 MPI Rank 2: 
 MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
 MPI Rank 2: 
@@ -1095,7 +1321,7 @@ MPI Rank 2: deviceId=0
 MPI Rank 2: parallelTrain=true
 MPI Rank 2: speechTrain=[
 MPI Rank 2:     action=train
-MPI Rank 2:     modelPath=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
+MPI Rank 2:     modelPath=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
 MPI Rank 2:     deviceId=0
 MPI Rank 2:     traceLevel=1
 MPI Rank 2:     SimpleNetworkBuilder=[
@@ -1171,30 +1397,32 @@ MPI Rank 2:           type=Real
 MPI Rank 2:           scpFile=glob_0000.scp
 MPI Rank 2:       ]
 MPI Rank 2:       labels=[
-MPI Rank 2:           mlfFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf
-MPI Rank 2:           labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list
+MPI Rank 2:           mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf
+MPI Rank 2:           labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list
 MPI Rank 2:           labelDim=132
 MPI Rank 2:           labelType=Category
 MPI Rank 2:       ]
 MPI Rank 2:     ]
 MPI Rank 2: ]
-MPI Rank 2: RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu
-MPI Rank 2: DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data
+MPI Rank 2: RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu
+MPI Rank 2: DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
+MPI Rank 2: ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN
 MPI Rank 2: DeviceId=0
-MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr
+MPI Rank 2: stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr
 MPI Rank 2: 
 MPI Rank 2: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 MPI Rank 2: 
 MPI Rank 2: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 MPI Rank 2: configparameters: cntk.config:command=speechTrain
-MPI Rank 2: configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data
+MPI Rank 2: configparameters: cntk.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\DNN
+MPI Rank 2: configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
 MPI Rank 2: configparameters: cntk.config:deviceId=0
 MPI Rank 2: configparameters: cntk.config:parallelTrain=true
 MPI Rank 2: configparameters: cntk.config:precision=float
-MPI Rank 2: configparameters: cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu
+MPI Rank 2: configparameters: cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu
 MPI Rank 2: configparameters: cntk.config:speechTrain=[
 MPI Rank 2:     action=train
-MPI Rank 2:     modelPath=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
+MPI Rank 2:     modelPath=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
 MPI Rank 2:     deviceId=0
 MPI Rank 2:     traceLevel=1
 MPI Rank 2:     SimpleNetworkBuilder=[
@@ -1270,34 +1498,36 @@ MPI Rank 2:           type=Real
 MPI Rank 2:           scpFile=glob_0000.scp
 MPI Rank 2:       ]
 MPI Rank 2:       labels=[
-MPI Rank 2:           mlfFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf
-MPI Rank 2:           labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list
+MPI Rank 2:           mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf
+MPI Rank 2:           labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list
 MPI Rank 2:           labelDim=132
 MPI Rank 2:           labelType=Category
 MPI Rank 2:       ]
 MPI Rank 2:     ]
 MPI Rank 2: ]
 MPI Rank 2: 
-MPI Rank 2: configparameters: cntk.config:stderr=C:\cygwin64\tmp\cntk-test-20151002132028.943306\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr
+MPI Rank 2: configparameters: cntk.config:stderr=C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/stderr
 MPI Rank 2: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 MPI Rank 2: command: speechTrain 
 MPI Rank 2: precision = float
+MPI Rank 2: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024141411.953694\Speech\DNN_ParallelNoQuantization@debug_gpu/models/cntkSpeech.dnn
 MPI Rank 2: CNTKCommandTrainInfo: speechTrain : 3
 MPI Rank 2: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3
 MPI Rank 2: CNTKCommandTrainBegin: speechTrain
 MPI Rank 2: SimpleNetworkBuilder Using GPU 0
 MPI Rank 2: reading script file glob_0000.scp ... 948 entries
 MPI Rank 2: trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion
-MPI Rank 2: total 132 state names in state list E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list
-MPI Rank 2: htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf ... total 948 entries
+MPI Rank 2: total 132 state names in state list E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list
+MPI Rank 2: htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf ... total 948 entries
 MPI Rank 2: ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
 MPI Rank 2: label set 0: 129 classes
 MPI Rank 2: minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
+MPI Rank 2: SetUniformRandomValue (GPU): creating curand object with seed 1
 MPI Rank 2: GetTrainCriterionNodes  ...
 MPI Rank 2: GetEvalCriterionNodes  ...
 MPI Rank 2: 
 MPI Rank 2: 
-MPI Rank 2: Validating node CrossEntropyWithSoftmax 
+MPI Rank 2: Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1.
 MPI Rank 2: 
 MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3]
 MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512]
@@ -1320,13 +1550,57 @@ MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1]
 MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
 MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
 MPI Rank 2: 
+MPI Rank 2: Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2.
 MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 2: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 2: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3]
+MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3]
+MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
+MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
+MPI Rank 2: 
+MPI Rank 2: Validating for node CrossEntropyWithSoftmax, final verification.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 2: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 2: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
+MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3]
+MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3]
+MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3]
+MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
+MPI Rank 2: Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
 MPI Rank 2: 
 MPI Rank 2: 9 out of 20 nodes do not share the minibatch layout with the input data.
-MPI Rank 2: Found 6 PreCompute nodes
-MPI Rank 2: 	NodeName: InvStdOfFeatures
-MPI Rank 2: 	NodeName: MeanOfFeatures
-MPI Rank 2: 	NodeName: Prior
+MPI Rank 2: 
+MPI Rank 2: 
+MPI Rank 2: Precomputing --> 3 PreCompute nodes found.
+MPI Rank 2: 
 MPI Rank 2: 	NodeName: InvStdOfFeatures
 MPI Rank 2: 	NodeName: MeanOfFeatures
 MPI Rank 2: 	NodeName: Prior
@@ -1334,120 +1608,185 @@ MPI Rank 2: minibatchiterator: epoch 0: frames [0..252734] (first utterance at f
 MPI Rank 2: requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
 MPI Rank 2: 
 MPI Rank 2: 
-MPI Rank 2: Validating node InvStdOfFeatures 
+MPI Rank 2: Validating for node InvStdOfFeatures. 2 nodes to process in pass 1.
 MPI Rank 2: 
-MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 64]
-MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 64]) -> [363, 1]
+MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 2: 
+MPI Rank 2: Validating for node InvStdOfFeatures, final verification.
 MPI Rank 2: 
+MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 2: 
 MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data.
 MPI Rank 2: 
 MPI Rank 2: 
-MPI Rank 2: Validating node MeanOfFeatures 
 MPI Rank 2: 
-MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 64]
-MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 64]) -> [363, 1]
+MPI Rank 2: Validating for node MeanOfFeatures. 2 nodes to process in pass 1.
 MPI Rank 2: 
+MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 2: 
+MPI Rank 2: Validating for node MeanOfFeatures, final verification.
+MPI Rank 2: 
+MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 3]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
 MPI Rank 2: 
 MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data.
 MPI Rank 2: 
 MPI Rank 2: 
-MPI Rank 2: Validating node Prior 
 MPI Rank 2: 
-MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 64]
-MPI Rank 2: Validating --> Prior = Mean(labels[132, MBSize 64]) -> [132, 1]
+MPI Rank 2: Validating for node Prior. 2 nodes to process in pass 1.
 MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 2: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
 MPI Rank 2: 
+MPI Rank 2: Validating for node Prior. 1 nodes to process in pass 2.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 2: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
+MPI Rank 2: 
+MPI Rank 2: Validating for node Prior, final verification.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 3]
+MPI Rank 2: Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
 MPI Rank 2: 
 MPI Rank 2: 1 out of 2 nodes do not share the minibatch layout with the input data.
+MPI Rank 2: 
+MPI Rank 2: 
+MPI Rank 2: Precomputing --> Completed.
+MPI Rank 2: 
 MPI Rank 2: Set Max Temp Mem Size For Convolution Nodes to 0 samples.
-MPI Rank 2: Starting Epoch 1: learning rate per sample = 0.015625  momentum = 0.900000 
+MPI Rank 2: Starting Epoch 1: learning rate per sample = 0.015625  effective momentum = 0.900000 
 MPI Rank 2: minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 2 of 3, with 1 datapasses
 MPI Rank 2: 
-MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
 MPI Rank 2: 
+MPI Rank 2: Validating for node EvalErrorPrediction. 20 nodes to process in pass 1.
 MPI Rank 2: 
-MPI Rank 2: Validating node EvalErrorPrediction 
-MPI Rank 2: 
-MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 16]
+MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 62]
 MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512]
 MPI Rank 2: Validating --> W1 = LearnableParameter -> [512, 512]
 MPI Rank 2: Validating --> W0 = LearnableParameter -> [512, 363]
-MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 16]
-MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 16]) -> [363, 1]
-MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 16]) -> [363, 1]
-MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 16], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 16]
-MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 16]) -> [512, MBSize 16]
+MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 62]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
 MPI Rank 2: Validating --> B0 = LearnableParameter -> [512, 1]
-MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 16], B0[512, 1]) -> [512, MBSize 16]
-MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 16]) -> [512, MBSize 16]
-MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 16]) -> [512, MBSize 16]
+MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
 MPI Rank 2: Validating --> B1 = LearnableParameter -> [512, 1]
-MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 16], B1[512, 1]) -> [512, MBSize 16]
-MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 16]) -> [512, MBSize 16]
-MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 16]) -> [132, MBSize 16]
+MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
 MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1]
-MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 16], B2[132, 1]) -> [132, MBSize 16]
-MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 16], HLast[132, MBSize 16]) -> [1, 1]
+MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
 MPI Rank 2: 
+MPI Rank 2: Validating for node EvalErrorPrediction. 10 nodes to process in pass 2.
 MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 62]
+MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 2: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 2: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 62]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
+MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
+MPI Rank 2: 
+MPI Rank 2: Validating for node EvalErrorPrediction, final verification.
+MPI Rank 2: 
+MPI Rank 2: Validating --> labels = InputValue -> [132, MBSize 62]
+MPI Rank 2: Validating --> W2 = LearnableParameter -> [132, 512]
+MPI Rank 2: Validating --> W1 = LearnableParameter -> [512, 512]
+MPI Rank 2: Validating --> W0 = LearnableParameter -> [512, 363]
+MPI Rank 2: Validating --> features = InputValue -> [363, MBSize 62]
+MPI Rank 2: Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 2: Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+MPI Rank 2: Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+MPI Rank 2: Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> B0 = LearnableParameter -> [512, 1]
+MPI Rank 2: Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> B1 = LearnableParameter -> [512, 1]
+MPI Rank 2: Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+MPI Rank 2: Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
+MPI Rank 2: Validating --> B2 = LearnableParameter -> [132, 1]
+MPI Rank 2: Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+MPI Rank 2: Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
 MPI Rank 2: 
 MPI Rank 2: 9 out of 20 nodes do not share the minibatch layout with the input data.
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[   1-  10 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.45646170; EvalErr[0]PerSample = 0.92500000; TotalTime = 0.49987s; TotalTimePerSample = 0.78104ms; SamplesPerSecond = 1280
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  11-  20 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.22315661; EvalErr[0]PerSample = 0.90156250; TotalTime = 0.41635s; TotalTimePerSample = 0.65054ms; SamplesPerSecond = 1537
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  21-  30 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.95180607; EvalErr[0]PerSample = 0.84687500; TotalTime = 0.39403s; TotalTimePerSample = 0.61567ms; SamplesPerSecond = 1624
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  31-  40 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.94158019; EvalErr[0]PerSample = 0.89843750; TotalTime = 0.40679s; TotalTimePerSample = 0.63560ms; SamplesPerSecond = 1573
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  41-  50 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.85668726; EvalErr[0]PerSample = 0.91093750; TotalTime = 0.38405s; TotalTimePerSample = 0.60008ms; SamplesPerSecond = 1666
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  51-  60 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.72866371; EvalErr[0]PerSample = 0.89531250; TotalTime = 0.38342s; TotalTimePerSample = 0.59910ms; SamplesPerSecond = 1669
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  61-  70 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.51808934; EvalErr[0]PerSample = 0.82968750; TotalTime = 0.37619s; TotalTimePerSample = 0.58779ms; SamplesPerSecond = 1701
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  71-  80 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.48455124; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.37204s; TotalTimePerSample = 0.58132ms; SamplesPerSecond = 1720
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  81-  90 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.33829281; EvalErr[0]PerSample = 0.76875000; TotalTime = 0.38168s; TotalTimePerSample = 0.59637ms; SamplesPerSecond = 1676
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.50167446; EvalErr[0]PerSample = 0.79843750; TotalTime = 0.37763s; TotalTimePerSample = 0.59004ms; SamplesPerSecond = 1694
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.22861682; EvalErr[0]PerSample = 0.80000000; TotalTime = 0.37533s; TotalTimePerSample = 0.58645ms; SamplesPerSecond = 1705
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.32616995; EvalErr[0]PerSample = 0.79062500; TotalTime = 0.37460s; TotalTimePerSample = 0.58531ms; SamplesPerSecond = 1708
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.16897953; EvalErr[0]PerSample = 0.77968750; TotalTime = 0.37657s; TotalTimePerSample = 0.58838ms; SamplesPerSecond = 1699
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.08892002; EvalErr[0]PerSample = 0.77656250; TotalTime = 0.38202s; TotalTimePerSample = 0.59690ms; SamplesPerSecond = 1675
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.06004848; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.37470s; TotalTimePerSample = 0.58546ms; SamplesPerSecond = 1708
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.91128321; EvalErr[0]PerSample = 0.69531250; TotalTime = 0.36968s; TotalTimePerSample = 0.57763ms; SamplesPerSecond = 1731
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.90171920; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.37682s; TotalTimePerSample = 0.58879ms; SamplesPerSecond = 1698
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.73262413; EvalErr[0]PerSample = 0.65312500; TotalTime = 0.37381s; TotalTimePerSample = 0.58408ms; SamplesPerSecond = 1712
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.66515363; EvalErr[0]PerSample = 0.68437500; TotalTime = 0.38731s; TotalTimePerSample = 0.60517ms; SamplesPerSecond = 1652
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.67382489; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.37515s; TotalTimePerSample = 0.58618ms; SamplesPerSecond = 1705
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.52869718; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.37936s; TotalTimePerSample = 0.59275ms; SamplesPerSecond = 1687
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.60031970; EvalErr[0]PerSample = 0.66718750; TotalTime = 0.37877s; TotalTimePerSample = 0.59183ms; SamplesPerSecond = 1689
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.51134087; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.37837s; TotalTimePerSample = 0.59121ms; SamplesPerSecond = 1691
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.45362164; EvalErr[0]PerSample = 0.63750000; TotalTime = 0.37481s; TotalTimePerSample = 0.58564ms; SamplesPerSecond = 1707
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.41640677; EvalErr[0]PerSample = 0.61562500; TotalTime = 0.36931s; TotalTimePerSample = 0.57705ms; SamplesPerSecond = 1732
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.39745369; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.37068s; TotalTimePerSample = 0.57918ms; SamplesPerSecond = 1726
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.16416032; EvalErr[0]PerSample = 0.56718750; TotalTime = 0.37472s; TotalTimePerSample = 0.58550ms; SamplesPerSecond = 1707
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.30346910; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.37953s; TotalTimePerSample = 0.59302ms; SamplesPerSecond = 1686
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.24398823; EvalErr[0]PerSample = 0.60937500; TotalTime = 0.37227s; TotalTimePerSample = 0.58167ms; SamplesPerSecond = 1719
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.15322470; EvalErr[0]PerSample = 0.57968750; TotalTime = 0.37312s; TotalTimePerSample = 0.58301ms; SamplesPerSecond = 1715
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.21664598; EvalErr[0]PerSample = 0.59531250; TotalTime = 0.38484s; TotalTimePerSample = 0.60131ms; SamplesPerSecond = 1663
-MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.25246635; EvalErr[0]PerSample = 0.60156250; TotalTime = 0.37786s; TotalTimePerSample = 0.59041ms; SamplesPerSecond = 1693
-MPI Rank 2: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.0000031; EvalErrPerSample = 0.72836914; Ave LearnRatePerSample = 0.015625; EpochTime=12.321116
-MPI Rank 2: Starting Epoch 2: learning rate per sample = 0.001953  momentum = 0.656119 
+MPI Rank 2: 
+MPI Rank 2: 
+MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[   1-  10 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.45645981; EvalErr[0]PerSample = 0.92500000; TotalTime = 0.35109s; TotalTimePerSample = 0.54858ms; SamplesPerSecond = 1822
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  11-  20 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.22315785; EvalErr[0]PerSample = 0.90156250; TotalTime = 0.28226s; TotalTimePerSample = 0.44103ms; SamplesPerSecond = 2267
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  21-  30 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.95180676; EvalErr[0]PerSample = 0.84687500; TotalTime = 0.25635s; TotalTimePerSample = 0.40055ms; SamplesPerSecond = 2496
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  31-  40 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.94158071; EvalErr[0]PerSample = 0.89843750; TotalTime = 0.29191s; TotalTimePerSample = 0.45612ms; SamplesPerSecond = 2192
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  41-  50 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.85668763; EvalErr[0]PerSample = 0.91093750; TotalTime = 0.26100s; TotalTimePerSample = 0.40781ms; SamplesPerSecond = 2452
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  51-  60 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.72866399; EvalErr[0]PerSample = 0.89531250; TotalTime = 0.24718s; TotalTimePerSample = 0.38621ms; SamplesPerSecond = 2589
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  61-  70 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.51808951; EvalErr[0]PerSample = 0.82968750; TotalTime = 0.26306s; TotalTimePerSample = 0.41103ms; SamplesPerSecond = 2432
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  71-  80 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.48455147; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.25239s; TotalTimePerSample = 0.39436ms; SamplesPerSecond = 2535
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  81-  90 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.33829288; EvalErr[0]PerSample = 0.76875000; TotalTime = 0.23938s; TotalTimePerSample = 0.37404ms; SamplesPerSecond = 2673
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[  91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.50167490; EvalErr[0]PerSample = 0.79843750; TotalTime = 0.24288s; TotalTimePerSample = 0.37949ms; SamplesPerSecond = 2635
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.22861768; EvalErr[0]PerSample = 0.80000000; TotalTime = 0.23594s; TotalTimePerSample = 0.36866ms; SamplesPerSecond = 2712
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.32617094; EvalErr[0]PerSample = 0.79062500; TotalTime = 0.23051s; TotalTimePerSample = 0.36018ms; SamplesPerSecond = 2776
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.16898033; EvalErr[0]PerSample = 0.77968750; TotalTime = 0.21907s; TotalTimePerSample = 0.34230ms; SamplesPerSecond = 2921
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.08892100; EvalErr[0]PerSample = 0.77656250; TotalTime = 0.21842s; TotalTimePerSample = 0.34128ms; SamplesPerSecond = 2930
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.06004828; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.23960s; TotalTimePerSample = 0.37438ms; SamplesPerSecond = 2671
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.91128317; EvalErr[0]PerSample = 0.69531250; TotalTime = 0.32538s; TotalTimePerSample = 0.50841ms; SamplesPerSecond = 1966
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.90171901; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.32097s; TotalTimePerSample = 0.50152ms; SamplesPerSecond = 1993
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.73262447; EvalErr[0]PerSample = 0.65312500; TotalTime = 0.33493s; TotalTimePerSample = 0.52334ms; SamplesPerSecond = 1910
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.66515410; EvalErr[0]PerSample = 0.68437500; TotalTime = 0.35758s; TotalTimePerSample = 0.55872ms; SamplesPerSecond = 1789
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.67382540; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.32538s; TotalTimePerSample = 0.50840ms; SamplesPerSecond = 1966
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.52869780; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.36246s; TotalTimePerSample = 0.56634ms; SamplesPerSecond = 1765
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.60032086; EvalErr[0]PerSample = 0.66718750; TotalTime = 0.35998s; TotalTimePerSample = 0.56246ms; SamplesPerSecond = 1777
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.51134188; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.33777s; TotalTimePerSample = 0.52777ms; SamplesPerSecond = 1894
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.45362252; EvalErr[0]PerSample = 0.63750000; TotalTime = 0.34429s; TotalTimePerSample = 0.53796ms; SamplesPerSecond = 1858
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.41640740; EvalErr[0]PerSample = 0.61562500; TotalTime = 0.33129s; TotalTimePerSample = 0.51763ms; SamplesPerSecond = 1931
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.39745478; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.35530s; TotalTimePerSample = 0.55515ms; SamplesPerSecond = 1801
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.16416053; EvalErr[0]PerSample = 0.56718750; TotalTime = 0.36414s; TotalTimePerSample = 0.56897ms; SamplesPerSecond = 1757
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.30346869; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.37427s; TotalTimePerSample = 0.58480ms; SamplesPerSecond = 1710
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.24398831; EvalErr[0]PerSample = 0.60937500; TotalTime = 0.36414s; TotalTimePerSample = 0.56897ms; SamplesPerSecond = 1757
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.15322487; EvalErr[0]PerSample = 0.57968750; TotalTime = 0.35425s; TotalTimePerSample = 0.55351ms; SamplesPerSecond = 1806
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.21664627; EvalErr[0]PerSample = 0.59531250; TotalTime = 0.34279s; TotalTimePerSample = 0.53561ms; SamplesPerSecond = 1867
+MPI Rank 2:  Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.25246685; EvalErr[0]PerSample = 0.60156250; TotalTime = 0.33153s; TotalTimePerSample = 0.51802ms; SamplesPerSecond = 1930
+MPI Rank 2: Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.0000035; EvalErrPerSample = 0.72836914; Ave LearnRatePerSample = 0.015625; EpochTime=9.773289
+MPI Rank 2: Starting Epoch 2: learning rate per sample = 0.001953  effective momentum = 0.656119 
 MPI Rank 2: minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 2 of 3, with 1 datapasses
 MPI Rank 2: 
 MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
-MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[   1-  10 of 80]: SamplesSeen = 2560; TrainLossPerSample =  2.08151923; EvalErr[0]PerSample = 0.55859375; TotalTime = 0.49122s; TotalTimePerSample = 0.19188ms; SamplesPerSecond = 5211
-MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  11-  20 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98395650; EvalErr[0]PerSample = 0.54257813; TotalTime = 0.45895s; TotalTimePerSample = 0.17928ms; SamplesPerSecond = 5577
-MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  21-  30 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98575441; EvalErr[0]PerSample = 0.54492188; TotalTime = 0.46993s; TotalTimePerSample = 0.18357ms; SamplesPerSecond = 5447
-MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  31-  40 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.90485007; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.47271s; TotalTimePerSample = 0.18465ms; SamplesPerSecond = 5415
-MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  41-  50 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.88324108; EvalErr[0]PerSample = 0.52539063; TotalTime = 0.48627s; TotalTimePerSample = 0.18995ms; SamplesPerSecond = 5264
-MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  51-  60 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.89109287; EvalErr[0]PerSample = 0.53359375; TotalTime = 0.48005s; TotalTimePerSample = 0.18752ms; SamplesPerSecond = 5332
-MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  61-  70 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.89496218; EvalErr[0]PerSample = 0.52890625; TotalTime = 0.45117s; TotalTimePerSample = 0.17624ms; SamplesPerSecond = 5674
-MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  71-  80 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.85944253; EvalErr[0]PerSample = 0.52265625; TotalTime = 0.43751s; TotalTimePerSample = 0.17090ms; SamplesPerSecond = 5851
-MPI Rank 2: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9356024; EvalErrPerSample = 0.53603516; Ave LearnRatePerSample = 0.001953125; EpochTime=3.770598
-MPI Rank 2: Starting Epoch 3: learning rate per sample = 0.000098  momentum = 0.656119 
+MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[   1-  10 of 80]: SamplesSeen = 2560; TrainLossPerSample =  2.08151948; EvalErr[0]PerSample = 0.55859375; TotalTime = 0.57966s; TotalTimePerSample = 0.22643ms; SamplesPerSecond = 4416
+MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  11-  20 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98395688; EvalErr[0]PerSample = 0.54257813; TotalTime = 0.46937s; TotalTimePerSample = 0.18335ms; SamplesPerSecond = 5454
+MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  21-  30 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98575479; EvalErr[0]PerSample = 0.54492188; TotalTime = 0.42189s; TotalTimePerSample = 0.16480ms; SamplesPerSecond = 6067
+MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  31-  40 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.90485039; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.33354s; TotalTimePerSample = 0.13029ms; SamplesPerSecond = 7675
+MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  41-  50 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.88324146; EvalErr[0]PerSample = 0.52539063; TotalTime = 0.31174s; TotalTimePerSample = 0.12177ms; SamplesPerSecond = 8211
+MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  51-  60 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.89109327; EvalErr[0]PerSample = 0.53359375; TotalTime = 0.29732s; TotalTimePerSample = 0.11614ms; SamplesPerSecond = 8610
+MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  61-  70 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.89496253; EvalErr[0]PerSample = 0.52890625; TotalTime = 0.32607s; TotalTimePerSample = 0.12737ms; SamplesPerSecond = 7851
+MPI Rank 2:  Epoch[ 2 of 3]-Minibatch[  71-  80 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.85944295; EvalErr[0]PerSample = 0.52265625; TotalTime = 0.34337s; TotalTimePerSample = 0.13413ms; SamplesPerSecond = 7455
+MPI Rank 2: Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9356027; EvalErrPerSample = 0.53603516; Ave LearnRatePerSample = 0.001953125; EpochTime=3.101094
+MPI Rank 2: Starting Epoch 3: learning rate per sample = 0.000098  effective momentum = 0.656119 
 MPI Rank 2: minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 2 of 3, with 1 datapasses
 MPI Rank 2: 
 MPI Rank 2: Starting minibatch loop, DataParallelSGD training (MyRank = 2, NumNodes = 3, NumGradientBits = 32), Distributed reading is ENABLED.
-MPI Rank 2:  Epoch[ 3 of 3]-Minibatch[   1-  10 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.86752815; EvalErr[0]PerSample = 0.52177734; TotalTime = 0.96478s; TotalTimePerSample = 0.09422ms; SamplesPerSecond = 10613
-MPI Rank 2:  Epoch[ 3 of 3]-Minibatch[  11-  20 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.87358797; EvalErr[0]PerSample = 0.51542969; TotalTime = 0.79373s; TotalTimePerSample = 0.07751ms; SamplesPerSecond = 12901
-MPI Rank 2: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8705581; EvalErrPerSample = 0.51860352; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=1.792243
+MPI Rank 2:  Epoch[ 3 of 3]-Minibatch[   1-  10 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.86752856; EvalErr[0]PerSample = 0.52177734; TotalTime = 1.03760s; TotalTimePerSample = 0.10133ms; SamplesPerSecond = 9868
+MPI Rank 2:  Epoch[ 3 of 3]-Minibatch[  11-  20 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.87358831; EvalErr[0]PerSample = 0.51542969; TotalTime = 0.82561s; TotalTimePerSample = 0.08063ms; SamplesPerSecond = 12402
+MPI Rank 2: Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8705584; EvalErrPerSample = 0.51860352; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=1.906446
 MPI Rank 2: CNTKCommandTrainEnd: speechTrain
 MPI Rank 2: COMPLETED
 MPI Rank 2: ~MPIWrapper
diff --git a/Tests/Speech/LSTM/FullUtterance/baseline.gpu.txt b/Tests/Speech/LSTM/FullUtterance/baseline.gpu.txt
index 84158c7cc..44f79cf16 100644
--- a/Tests/Speech/LSTM/FullUtterance/baseline.gpu.txt
+++ b/Tests/Speech/LSTM/FullUtterance/baseline.gpu.txt
@@ -1,7 +1,7 @@
-=== Running /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/../cntk.config RunDir=/tmp/cntk-test-20151001135442.141617/Speech/LSTM_FullUtterance@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data DeviceId=0 NDLDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/.. Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=1]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]]
-running on localhost at 2015/10/01 13:54:42
-command line options: 
-configFile=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/../cntk.config RunDir=/tmp/cntk-test-20151001135442.141617/Speech/LSTM_FullUtterance@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data DeviceId=0 NDLDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/.. Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=1]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]] 
+=== Running /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/../cntk.config RunDir=/tmp/cntk-test-20151024125903.915765/Speech/LSTM_FullUtterance@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/.. DeviceId=0 Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=1]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]]
+running on localhost at 2015/10/24 12:59:03
+command line: 
+/home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/../cntk.config RunDir=/tmp/cntk-test-20151024125903.915765/Speech/LSTM_FullUtterance@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/.. DeviceId=0 Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=1]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]] 
 
 >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
 precision=float
@@ -15,9 +15,6 @@ speechTrain=[
     modelPath=$RunDir$/models/cntkSpeech.dnn
     deviceId=$DeviceId$
     traceLevel=1
-    NDLNetworkBuilder=[
-        networkDescription=$NDLDir$/lstmp-3layer_WithSelfStab.ndl
-    ]    
     SGD=[
         epochSize=20480
         minibatchSize=20
@@ -191,10 +188,10 @@ feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features);
         ScaledLogLikelihood = Minus(LSTMoutputW, logPrior, tag='output')    // sadly we can't say x - y since we want to assign a tag
     ]
 ]
-RunDir=/tmp/cntk-test-20151001135442.141617/Speech/LSTM_FullUtterance@debug_gpu
+RunDir=/tmp/cntk-test-20151024125903.915765/Speech/LSTM_FullUtterance@debug_gpu
 DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
+ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/..
 DeviceId=0
-NDLDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/..
 Truncated=false
 speechTrain=[reader=[nbruttsineachrecurrentiter=1]]
 speechTrain=[SGD=[epochSize=2560]]
@@ -212,12 +209,9 @@ frameMode=false
 Truncated=true
 speechTrain=[
     action=train
-    modelPath=/tmp/cntk-test-20151001135442.141617/Speech/LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn
+    modelPath=/tmp/cntk-test-20151024125903.915765/Speech/LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn
     deviceId=0
     traceLevel=1
-    NDLNetworkBuilder=[
-        networkDescription=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/../lstmp-3layer_WithSelfStab.ndl
-    ]    
     SGD=[
         epochSize=20480
         minibatchSize=20
@@ -391,10 +385,10 @@ feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features);
         ScaledLogLikelihood = Minus(LSTMoutputW, logPrior, tag='output')    // sadly we can't say x - y since we want to assign a tag
     ]
 ]
-RunDir=/tmp/cntk-test-20151001135442.141617/Speech/LSTM_FullUtterance@debug_gpu
+RunDir=/tmp/cntk-test-20151024125903.915765/Speech/LSTM_FullUtterance@debug_gpu
 DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
+ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/..
 DeviceId=0
-NDLDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/..
 Truncated=false
 speechTrain=[reader=[nbruttsineachrecurrentiter=1]]
 speechTrain=[SGD=[epochSize=2560]]
@@ -405,21 +399,18 @@ speechTrain=[SGD=[numMBsToShowResult=1]]
 
 >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 configparameters: cntk.config:command=speechTrain
+configparameters: cntk.config:ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/..
 configparameters: cntk.config:DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
 configparameters: cntk.config:deviceId=0
 configparameters: cntk.config:frameMode=false
-configparameters: cntk.config:NDLDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/..
 configparameters: cntk.config:parallelTrain=false
 configparameters: cntk.config:precision=float
-configparameters: cntk.config:RunDir=/tmp/cntk-test-20151001135442.141617/Speech/LSTM_FullUtterance@debug_gpu
+configparameters: cntk.config:RunDir=/tmp/cntk-test-20151024125903.915765/Speech/LSTM_FullUtterance@debug_gpu
 configparameters: cntk.config:speechTrain=[
     action=train
-    modelPath=/tmp/cntk-test-20151001135442.141617/Speech/LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn
+    modelPath=/tmp/cntk-test-20151024125903.915765/Speech/LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn
     deviceId=0
     traceLevel=1
-    NDLNetworkBuilder=[
-        networkDescription=/home/mluser/src/cplx_master/Tests/Speech/LSTM/FullUtterance/../lstmp-3layer_WithSelfStab.ndl
-    ]    
     SGD=[
         epochSize=20480
         minibatchSize=20
@@ -598,9 +589,11 @@ configparameters: cntk.config:Truncated=false
 <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 command: speechTrain 
 precision = float
-CNTKModelPath: /tmp/cntk-test-20151001135442.141617/Speech/LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn
+CNTKModelPath: /tmp/cntk-test-20151024125903.915765/Speech/LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn
+CNTKCommandTrainInfo: speechTrain : 2
+CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 2
 CNTKCommandTrainBegin: speechTrain
-NDLBuilder Using GPU 0
+ExperimentalNetworkBuilder using GPU 0
 reading script file /home/mluser/src/cplx_master/Tests/Speech/Data/glob_0000.scp ... 948 entries
 trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion
 total 132 state names in state list /home/mluser/src/cplx_master/Tests/Speech/Data/state.list
@@ -608,2124 +601,3382 @@ htkmlfreader: reading MLF file /home/mluser/src/cplx_master/Tests/Speech/Data/gl
 ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
 label set 0: 129 classes
 minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
- nodes in the recurrent loops : 
-LSTMoutput1.unnamed174	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.bit	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.unnamed224	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.bit	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.unnamed274	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.bit	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.unnamed174	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.bit	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.unnamed224	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.bit	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.unnamed274	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.bit	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
-
-Printing Gradient Computation Node Order ... 
-
-cr[0, 0] = CrossEntropyWithSoftmax(labels[132, 1], LSTMoutputW[0, 0])
-LSTMoutputW[0, 0] = Plus(unnamed283[0, 0], b[132, 1])
-b[132, 1] = LearnableParameter
-unnamed283[0, 0] = Times(W[132, 256], unnamed284[0, 0])
-unnamed284[0, 0] = Scale(expsW[0, 0], LSTMoutput3.output[0, 0])
-LSTMoutput3.output[0, 0] = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[0, 0])
-LSTMoutput3.unnamed275[0, 0] = Scale(LSTMoutput3.expsWmr[0, 0], LSTMoutput3.mt[0, 0])
-LSTMoutput3.mt[0, 0] = ElementTimes(LSTMoutput3.ot[0, 0], LSTMoutput3.unnamed274[0, 0])
-LSTMoutput3.unnamed274[0, 0] = Tanh(LSTMoutput3.ct[0, 0])
-LSTMoutput3.ot[0, 0] = Sigmoid(LSTMoutput3.unnamed271[0, 0])
-LSTMoutput3.unnamed271[0, 0] = Plus(LSTMoutput3.unnamed272[0, 0], LSTMoutput3.Wcoct[0, 0])
-LSTMoutput3.Wcoct[0, 0] = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[0, 0])
-LSTMoutput3.unnamed270[0, 0] = Scale(LSTMoutput3.expsWco[0, 0], LSTMoutput3.ct[0, 0])
-LSTMoutput3.ct[0, 0] = Plus(LSTMoutput3.bft[0, 0], LSTMoutput3.bit[0, 0])
-LSTMoutput3.bit[0, 0] = ElementTimes(LSTMoutput3.it[0, 0], LSTMoutput3.unnamed259[0, 0])
-LSTMoutput3.unnamed259[0, 0] = Tanh(LSTMoutput3.unnamed260[0, 0])
-LSTMoutput3.unnamed260[0, 0] = Plus(LSTMoutput3.Wxcx[0, 0], LSTMoutput3.unnamed261[0, 0])
-LSTMoutput3.unnamed261[0, 0] = Plus(LSTMoutput3.Whcdh[0, 0], LSTMoutput3.bc[1024, 1])
-LSTMoutput3.Whcdh[0, 0] = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[0, 0])
-LSTMoutput3.unnamed258[0, 0] = Scale(LSTMoutput3.expsWhc[0, 0], LSTMoutput3.dh[256, 1])
-LSTMoutput3.it[0, 0] = Sigmoid(LSTMoutput3.unnamed254[0, 0])
-LSTMoutput3.unnamed254[0, 0] = Plus(LSTMoutput3.unnamed255[0, 0], LSTMoutput3.Wcidc[0, 0])
-LSTMoutput3.Wcidc[0, 0] = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[0, 0])
-LSTMoutput3.unnamed253[0, 0] = Scale(LSTMoutput3.expsWci[0, 0], LSTMoutput3.dc[1024, 1])
-LSTMoutput3.unnamed255[0, 0] = Plus(LSTMoutput3.unnamed256[0, 0], LSTMoutput3.Whidh[0, 0])
-LSTMoutput3.Whidh[0, 0] = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[0, 0])
-LSTMoutput3.unnamed252[0, 0] = Scale(LSTMoutput3.expsWhi[0, 0], LSTMoutput3.dh[256, 1])
-LSTMoutput3.bft[0, 0] = ElementTimes(LSTMoutput3.ft[0, 0], LSTMoutput3.dc[1024, 1])
-LSTMoutput3.ft[0, 0] = Sigmoid(LSTMoutput3.unnamed265[0, 0])
-LSTMoutput3.unnamed265[0, 0] = Plus(LSTMoutput3.unnamed266[0, 0], LSTMoutput3.Wcfdc[0, 0])
-LSTMoutput3.Wcfdc[0, 0] = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[0, 0])
-LSTMoutput3.unnamed264[0, 0] = Scale(LSTMoutput3.expsWcf[0, 0], LSTMoutput3.dc[1024, 1])
-LSTMoutput3.dc[1024, 1] = PastValue(LSTMoutput3.ct[0, 0])
-LSTMoutput3.unnamed266[0, 0] = Plus(LSTMoutput3.unnamed267[0, 0], LSTMoutput3.Whfdh[0, 0])
-LSTMoutput3.Whfdh[0, 0] = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[0, 0])
-LSTMoutput3.unnamed263[0, 0] = Scale(LSTMoutput3.expsWhf[0, 0], LSTMoutput3.dh[256, 1])
-LSTMoutput3.unnamed272[0, 0] = Plus(LSTMoutput3.unnamed273[0, 0], LSTMoutput3.Whodh[0, 0])
-LSTMoutput3.Whodh[0, 0] = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[0, 0])
-LSTMoutput3.unnamed269[0, 0] = Scale(LSTMoutput3.expsWho[0, 0], LSTMoutput3.dh[256, 1])
-LSTMoutput3.dh[256, 1] = PastValue(LSTMoutput3.output[0, 0])
-LSTMoutput3.bc[1024, 1] = LearnableParameter
-LSTMoutput3.expsWhc[0, 0] = Exp(LSTMoutput3.sWhc[1, 1])
-LSTMoutput3.sWhc[1, 1] = LearnableParameter
-LSTMoutput3.Whc[1024, 256] = LearnableParameter
-LSTMoutput3.Wxcx[0, 0] = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[0, 0])
-LSTMoutput3.unnamed257[0, 0] = Scale(LSTMoutput3.expsWxc[0, 0], LSTMoutput2.output[0, 0])
-LSTMoutput3.expsWxc[0, 0] = Exp(LSTMoutput3.sWxc[1, 1])
-LSTMoutput3.sWxc[1, 1] = LearnableParameter
-LSTMoutput3.Wxc[1024, 256] = LearnableParameter
-LSTMoutput3.expsWci[0, 0] = Exp(LSTMoutput3.sWci[1, 1])
-LSTMoutput3.sWci[1, 1] = LearnableParameter
-LSTMoutput3.Wci[1024, 1] = LearnableParameter
-LSTMoutput3.expsWhi[0, 0] = Exp(LSTMoutput3.sWhi[1, 1])
-LSTMoutput3.sWhi[1, 1] = LearnableParameter
-LSTMoutput3.Whi[1024, 256] = LearnableParameter
-LSTMoutput3.unnamed256[0, 0] = Plus(LSTMoutput3.Wxix[0, 0], LSTMoutput3.bi[1024, 1])
-LSTMoutput3.bi[1024, 1] = LearnableParameter
-LSTMoutput3.Wxix[0, 0] = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[0, 0])
-LSTMoutput3.unnamed251[0, 0] = Scale(LSTMoutput3.expsWxi[0, 0], LSTMoutput2.output[0, 0])
-LSTMoutput3.expsWxi[0, 0] = Exp(LSTMoutput3.sWxi[1, 1])
-LSTMoutput3.sWxi[1, 1] = LearnableParameter
-LSTMoutput3.Wxi[1024, 256] = LearnableParameter
-LSTMoutput3.expsWcf[0, 0] = Exp(LSTMoutput3.sWcf[1, 1])
-LSTMoutput3.sWcf[1, 1] = LearnableParameter
-LSTMoutput3.Wcf[1024, 1] = LearnableParameter
-LSTMoutput3.expsWhf[0, 0] = Exp(LSTMoutput3.sWhf[1, 1])
-LSTMoutput3.sWhf[1, 1] = LearnableParameter
-LSTMoutput3.Whf[1024, 256] = LearnableParameter
-LSTMoutput3.unnamed267[0, 0] = Plus(LSTMoutput3.Wxfx[0, 0], LSTMoutput3.bf[1024, 1])
-LSTMoutput3.bf[1024, 1] = LearnableParameter
-LSTMoutput3.Wxfx[0, 0] = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[0, 0])
-LSTMoutput3.unnamed262[0, 0] = Scale(LSTMoutput3.expsWxf[0, 0], LSTMoutput2.output[0, 0])
-LSTMoutput3.expsWxf[0, 0] = Exp(LSTMoutput3.sWxf[1, 1])
-LSTMoutput3.sWxf[1, 1] = LearnableParameter
-LSTMoutput3.Wxf[1024, 256] = LearnableParameter
-LSTMoutput3.expsWco[0, 0] = Exp(LSTMoutput3.sWco[1, 1])
-LSTMoutput3.sWco[1, 1] = LearnableParameter
-LSTMoutput3.Wco[1024, 1] = LearnableParameter
-LSTMoutput3.expsWho[0, 0] = Exp(LSTMoutput3.sWho[1, 1])
-LSTMoutput3.sWho[1, 1] = LearnableParameter
-LSTMoutput3.Who[1024, 256] = LearnableParameter
-LSTMoutput3.unnamed273[0, 0] = Plus(LSTMoutput3.Wxox[0, 0], LSTMoutput3.bo[1024, 1])
-LSTMoutput3.bo[1024, 1] = LearnableParameter
-LSTMoutput3.Wxox[0, 0] = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[0, 0])
-LSTMoutput3.unnamed268[0, 0] = Scale(LSTMoutput3.expsWxo[0, 0], LSTMoutput2.output[0, 0])
-LSTMoutput2.output[0, 0] = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[0, 0])
-LSTMoutput2.unnamed225[0, 0] = Scale(LSTMoutput2.expsWmr[0, 0], LSTMoutput2.mt[0, 0])
-LSTMoutput2.mt[0, 0] = ElementTimes(LSTMoutput2.ot[0, 0], LSTMoutput2.unnamed224[0, 0])
-LSTMoutput2.unnamed224[0, 0] = Tanh(LSTMoutput2.ct[0, 0])
-LSTMoutput2.ot[0, 0] = Sigmoid(LSTMoutput2.unnamed221[0, 0])
-LSTMoutput2.unnamed221[0, 0] = Plus(LSTMoutput2.unnamed222[0, 0], LSTMoutput2.Wcoct[0, 0])
-LSTMoutput2.Wcoct[0, 0] = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[0, 0])
-LSTMoutput2.unnamed220[0, 0] = Scale(LSTMoutput2.expsWco[0, 0], LSTMoutput2.ct[0, 0])
-LSTMoutput2.ct[0, 0] = Plus(LSTMoutput2.bft[0, 0], LSTMoutput2.bit[0, 0])
-LSTMoutput2.bit[0, 0] = ElementTimes(LSTMoutput2.it[0, 0], LSTMoutput2.unnamed209[0, 0])
-LSTMoutput2.unnamed209[0, 0] = Tanh(LSTMoutput2.unnamed210[0, 0])
-LSTMoutput2.unnamed210[0, 0] = Plus(LSTMoutput2.Wxcx[0, 0], LSTMoutput2.unnamed211[0, 0])
-LSTMoutput2.unnamed211[0, 0] = Plus(LSTMoutput2.Whcdh[0, 0], LSTMoutput2.bc[1024, 1])
-LSTMoutput2.Whcdh[0, 0] = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[0, 0])
-LSTMoutput2.unnamed208[0, 0] = Scale(LSTMoutput2.expsWhc[0, 0], LSTMoutput2.dh[256, 1])
-LSTMoutput2.it[0, 0] = Sigmoid(LSTMoutput2.unnamed204[0, 0])
-LSTMoutput2.unnamed204[0, 0] = Plus(LSTMoutput2.unnamed205[0, 0], LSTMoutput2.Wcidc[0, 0])
-LSTMoutput2.Wcidc[0, 0] = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[0, 0])
-LSTMoutput2.unnamed203[0, 0] = Scale(LSTMoutput2.expsWci[0, 0], LSTMoutput2.dc[1024, 1])
-LSTMoutput2.unnamed205[0, 0] = Plus(LSTMoutput2.unnamed206[0, 0], LSTMoutput2.Whidh[0, 0])
-LSTMoutput2.Whidh[0, 0] = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[0, 0])
-LSTMoutput2.unnamed202[0, 0] = Scale(LSTMoutput2.expsWhi[0, 0], LSTMoutput2.dh[256, 1])
-LSTMoutput2.bft[0, 0] = ElementTimes(LSTMoutput2.ft[0, 0], LSTMoutput2.dc[1024, 1])
-LSTMoutput2.ft[0, 0] = Sigmoid(LSTMoutput2.unnamed215[0, 0])
-LSTMoutput2.unnamed215[0, 0] = Plus(LSTMoutput2.unnamed216[0, 0], LSTMoutput2.Wcfdc[0, 0])
-LSTMoutput2.Wcfdc[0, 0] = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[0, 0])
-LSTMoutput2.unnamed214[0, 0] = Scale(LSTMoutput2.expsWcf[0, 0], LSTMoutput2.dc[1024, 1])
-LSTMoutput2.dc[1024, 1] = PastValue(LSTMoutput2.ct[0, 0])
-LSTMoutput2.unnamed216[0, 0] = Plus(LSTMoutput2.unnamed217[0, 0], LSTMoutput2.Whfdh[0, 0])
-LSTMoutput2.Whfdh[0, 0] = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[0, 0])
-LSTMoutput2.unnamed213[0, 0] = Scale(LSTMoutput2.expsWhf[0, 0], LSTMoutput2.dh[256, 1])
-LSTMoutput2.unnamed222[0, 0] = Plus(LSTMoutput2.unnamed223[0, 0], LSTMoutput2.Whodh[0, 0])
-LSTMoutput2.Whodh[0, 0] = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[0, 0])
-LSTMoutput2.unnamed219[0, 0] = Scale(LSTMoutput2.expsWho[0, 0], LSTMoutput2.dh[256, 1])
-LSTMoutput2.dh[256, 1] = PastValue(LSTMoutput2.output[0, 0])
-LSTMoutput2.bc[1024, 1] = LearnableParameter
-LSTMoutput2.expsWhc[0, 0] = Exp(LSTMoutput2.sWhc[1, 1])
-LSTMoutput2.sWhc[1, 1] = LearnableParameter
-LSTMoutput2.Whc[1024, 256] = LearnableParameter
-LSTMoutput2.Wxcx[0, 0] = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[0, 0])
-LSTMoutput2.unnamed207[0, 0] = Scale(LSTMoutput2.expsWxc[0, 0], LSTMoutput1.output[0, 0])
-LSTMoutput2.expsWxc[0, 0] = Exp(LSTMoutput2.sWxc[1, 1])
-LSTMoutput2.sWxc[1, 1] = LearnableParameter
-LSTMoutput2.Wxc[1024, 256] = LearnableParameter
-LSTMoutput2.expsWci[0, 0] = Exp(LSTMoutput2.sWci[1, 1])
-LSTMoutput2.sWci[1, 1] = LearnableParameter
-LSTMoutput2.Wci[1024, 1] = LearnableParameter
-LSTMoutput2.expsWhi[0, 0] = Exp(LSTMoutput2.sWhi[1, 1])
-LSTMoutput2.sWhi[1, 1] = LearnableParameter
-LSTMoutput2.Whi[1024, 256] = LearnableParameter
-LSTMoutput2.unnamed206[0, 0] = Plus(LSTMoutput2.Wxix[0, 0], LSTMoutput2.bi[1024, 1])
-LSTMoutput2.bi[1024, 1] = LearnableParameter
-LSTMoutput2.Wxix[0, 0] = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[0, 0])
-LSTMoutput2.unnamed201[0, 0] = Scale(LSTMoutput2.expsWxi[0, 0], LSTMoutput1.output[0, 0])
-LSTMoutput2.expsWxi[0, 0] = Exp(LSTMoutput2.sWxi[1, 1])
-LSTMoutput2.sWxi[1, 1] = LearnableParameter
-LSTMoutput2.Wxi[1024, 256] = LearnableParameter
-LSTMoutput2.expsWcf[0, 0] = Exp(LSTMoutput2.sWcf[1, 1])
-LSTMoutput2.sWcf[1, 1] = LearnableParameter
-LSTMoutput2.Wcf[1024, 1] = LearnableParameter
-LSTMoutput2.expsWhf[0, 0] = Exp(LSTMoutput2.sWhf[1, 1])
-LSTMoutput2.sWhf[1, 1] = LearnableParameter
-LSTMoutput2.Whf[1024, 256] = LearnableParameter
-LSTMoutput2.unnamed217[0, 0] = Plus(LSTMoutput2.Wxfx[0, 0], LSTMoutput2.bf[1024, 1])
-LSTMoutput2.bf[1024, 1] = LearnableParameter
-LSTMoutput2.Wxfx[0, 0] = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[0, 0])
-LSTMoutput2.unnamed212[0, 0] = Scale(LSTMoutput2.expsWxf[0, 0], LSTMoutput1.output[0, 0])
-LSTMoutput2.expsWxf[0, 0] = Exp(LSTMoutput2.sWxf[1, 1])
-LSTMoutput2.sWxf[1, 1] = LearnableParameter
-LSTMoutput2.Wxf[1024, 256] = LearnableParameter
-LSTMoutput2.expsWco[0, 0] = Exp(LSTMoutput2.sWco[1, 1])
-LSTMoutput2.sWco[1, 1] = LearnableParameter
-LSTMoutput2.Wco[1024, 1] = LearnableParameter
-LSTMoutput2.expsWho[0, 0] = Exp(LSTMoutput2.sWho[1, 1])
-LSTMoutput2.sWho[1, 1] = LearnableParameter
-LSTMoutput2.Who[1024, 256] = LearnableParameter
-LSTMoutput2.unnamed223[0, 0] = Plus(LSTMoutput2.Wxox[0, 0], LSTMoutput2.bo[1024, 1])
-LSTMoutput2.bo[1024, 1] = LearnableParameter
-LSTMoutput2.Wxox[0, 0] = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[0, 0])
-LSTMoutput2.unnamed218[0, 0] = Scale(LSTMoutput2.expsWxo[0, 0], LSTMoutput1.output[0, 0])
-LSTMoutput1.output[0, 0] = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[0, 0])
-LSTMoutput1.unnamed175[0, 0] = Scale(LSTMoutput1.expsWmr[0, 0], LSTMoutput1.mt[0, 0])
-LSTMoutput1.mt[0, 0] = ElementTimes(LSTMoutput1.ot[0, 0], LSTMoutput1.unnamed174[0, 0])
-LSTMoutput1.unnamed174[0, 0] = Tanh(LSTMoutput1.ct[0, 0])
-LSTMoutput1.ot[0, 0] = Sigmoid(LSTMoutput1.unnamed171[0, 0])
-LSTMoutput1.unnamed171[0, 0] = Plus(LSTMoutput1.unnamed172[0, 0], LSTMoutput1.Wcoct[0, 0])
-LSTMoutput1.Wcoct[0, 0] = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[0, 0])
-LSTMoutput1.unnamed170[0, 0] = Scale(LSTMoutput1.expsWco[0, 0], LSTMoutput1.ct[0, 0])
-LSTMoutput1.ct[0, 0] = Plus(LSTMoutput1.bft[0, 0], LSTMoutput1.bit[0, 0])
-LSTMoutput1.bit[0, 0] = ElementTimes(LSTMoutput1.it[0, 0], LSTMoutput1.unnamed159[0, 0])
-LSTMoutput1.unnamed159[0, 0] = Tanh(LSTMoutput1.unnamed160[0, 0])
-LSTMoutput1.unnamed160[0, 0] = Plus(LSTMoutput1.Wxcx[0, 0], LSTMoutput1.unnamed161[0, 0])
-LSTMoutput1.unnamed161[0, 0] = Plus(LSTMoutput1.Whcdh[0, 0], LSTMoutput1.bc[1024, 1])
-LSTMoutput1.Whcdh[0, 0] = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[0, 0])
-LSTMoutput1.unnamed158[0, 0] = Scale(LSTMoutput1.expsWhc[0, 0], LSTMoutput1.dh[256, 1])
-LSTMoutput1.it[0, 0] = Sigmoid(LSTMoutput1.unnamed154[0, 0])
-LSTMoutput1.unnamed154[0, 0] = Plus(LSTMoutput1.unnamed155[0, 0], LSTMoutput1.Wcidc[0, 0])
-LSTMoutput1.Wcidc[0, 0] = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[0, 0])
-LSTMoutput1.unnamed153[0, 0] = Scale(LSTMoutput1.expsWci[0, 0], LSTMoutput1.dc[1024, 1])
-LSTMoutput1.unnamed155[0, 0] = Plus(LSTMoutput1.unnamed156[0, 0], LSTMoutput1.Whidh[0, 0])
-LSTMoutput1.Whidh[0, 0] = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[0, 0])
-LSTMoutput1.unnamed152[0, 0] = Scale(LSTMoutput1.expsWhi[0, 0], LSTMoutput1.dh[256, 1])
-LSTMoutput1.bft[0, 0] = ElementTimes(LSTMoutput1.ft[0, 0], LSTMoutput1.dc[1024, 1])
-LSTMoutput1.ft[0, 0] = Sigmoid(LSTMoutput1.unnamed165[0, 0])
-LSTMoutput1.unnamed165[0, 0] = Plus(LSTMoutput1.unnamed166[0, 0], LSTMoutput1.Wcfdc[0, 0])
-LSTMoutput1.Wcfdc[0, 0] = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[0, 0])
-LSTMoutput1.unnamed164[0, 0] = Scale(LSTMoutput1.expsWcf[0, 0], LSTMoutput1.dc[1024, 1])
-LSTMoutput1.dc[1024, 1] = PastValue(LSTMoutput1.ct[0, 0])
-LSTMoutput1.unnamed166[0, 0] = Plus(LSTMoutput1.unnamed167[0, 0], LSTMoutput1.Whfdh[0, 0])
-LSTMoutput1.Whfdh[0, 0] = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[0, 0])
-LSTMoutput1.unnamed163[0, 0] = Scale(LSTMoutput1.expsWhf[0, 0], LSTMoutput1.dh[256, 1])
-LSTMoutput1.unnamed172[0, 0] = Plus(LSTMoutput1.unnamed173[0, 0], LSTMoutput1.Whodh[0, 0])
-LSTMoutput1.Whodh[0, 0] = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[0, 0])
-LSTMoutput1.unnamed169[0, 0] = Scale(LSTMoutput1.expsWho[0, 0], LSTMoutput1.dh[256, 1])
-LSTMoutput1.dh[256, 1] = PastValue(LSTMoutput1.output[0, 0])
-LSTMoutput1.bc[1024, 1] = LearnableParameter
-LSTMoutput1.expsWhc[0, 0] = Exp(LSTMoutput1.sWhc[1, 1])
-LSTMoutput1.sWhc[1, 1] = LearnableParameter
-LSTMoutput1.Whc[1024, 256] = LearnableParameter
-LSTMoutput1.Wxcx[0, 0] = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[0, 0])
-LSTMoutput1.unnamed157[0, 0] = Scale(LSTMoutput1.expsWxc[0, 0], featNorm.xNorm[0, 0])
-LSTMoutput1.expsWxc[0, 0] = Exp(LSTMoutput1.sWxc[1, 1])
-LSTMoutput1.sWxc[1, 1] = LearnableParameter
-LSTMoutput1.Wxc[1024, 33] = LearnableParameter
-LSTMoutput1.expsWci[0, 0] = Exp(LSTMoutput1.sWci[1, 1])
-LSTMoutput1.sWci[1, 1] = LearnableParameter
-LSTMoutput1.Wci[1024, 1] = LearnableParameter
-LSTMoutput1.expsWhi[0, 0] = Exp(LSTMoutput1.sWhi[1, 1])
-LSTMoutput1.sWhi[1, 1] = LearnableParameter
-LSTMoutput1.Whi[1024, 256] = LearnableParameter
-LSTMoutput1.unnamed156[0, 0] = Plus(LSTMoutput1.Wxix[0, 0], LSTMoutput1.bi[1024, 1])
-LSTMoutput1.bi[1024, 1] = LearnableParameter
-LSTMoutput1.Wxix[0, 0] = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[0, 0])
-LSTMoutput1.unnamed151[0, 0] = Scale(LSTMoutput1.expsWxi[0, 0], featNorm.xNorm[0, 0])
-LSTMoutput1.expsWxi[0, 0] = Exp(LSTMoutput1.sWxi[1, 1])
-LSTMoutput1.sWxi[1, 1] = LearnableParameter
-LSTMoutput1.Wxi[1024, 33] = LearnableParameter
-LSTMoutput1.expsWcf[0, 0] = Exp(LSTMoutput1.sWcf[1, 1])
-LSTMoutput1.sWcf[1, 1] = LearnableParameter
-LSTMoutput1.Wcf[1024, 1] = LearnableParameter
-LSTMoutput1.expsWhf[0, 0] = Exp(LSTMoutput1.sWhf[1, 1])
-LSTMoutput1.sWhf[1, 1] = LearnableParameter
-LSTMoutput1.Whf[1024, 256] = LearnableParameter
-LSTMoutput1.unnamed167[0, 0] = Plus(LSTMoutput1.Wxfx[0, 0], LSTMoutput1.bf[1024, 1])
-LSTMoutput1.bf[1024, 1] = LearnableParameter
-LSTMoutput1.Wxfx[0, 0] = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[0, 0])
-LSTMoutput1.unnamed162[0, 0] = Scale(LSTMoutput1.expsWxf[0, 0], featNorm.xNorm[0, 0])
-LSTMoutput1.expsWxf[0, 0] = Exp(LSTMoutput1.sWxf[1, 1])
-LSTMoutput1.sWxf[1, 1] = LearnableParameter
-LSTMoutput1.Wxf[1024, 33] = LearnableParameter
-LSTMoutput1.expsWco[0, 0] = Exp(LSTMoutput1.sWco[1, 1])
-LSTMoutput1.sWco[1, 1] = LearnableParameter
-LSTMoutput1.Wco[1024, 1] = LearnableParameter
-LSTMoutput1.expsWho[0, 0] = Exp(LSTMoutput1.sWho[1, 1])
-LSTMoutput1.sWho[1, 1] = LearnableParameter
-LSTMoutput1.Who[1024, 256] = LearnableParameter
-LSTMoutput1.unnamed173[0, 0] = Plus(LSTMoutput1.Wxox[0, 0], LSTMoutput1.bo[1024, 1])
-LSTMoutput1.bo[1024, 1] = LearnableParameter
-LSTMoutput1.Wxox[0, 0] = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[0, 0])
-LSTMoutput1.unnamed168[0, 0] = Scale(LSTMoutput1.expsWxo[0, 0], featNorm.xNorm[0, 0])
-featNorm.xNorm[0, 0] = PerDimMeanVarNormalization(feashift[0, 0], featNorm.xMean[0, 0], featNorm.xStdDev[0, 0])
-featNorm.xStdDev[0, 0] = InvStdDev(feashift[0, 0])
-featNorm.xMean[0, 0] = Mean(feashift[0, 0])
-feashift[0, 0] = RowSlice(features[363, 1])
-features[363, 1] = InputValue
-LSTMoutput1.expsWxo[0, 0] = Exp(LSTMoutput1.sWxo[1, 1])
-LSTMoutput1.sWxo[1, 1] = LearnableParameter
-LSTMoutput1.Wxo[1024, 33] = LearnableParameter
-LSTMoutput1.expsWmr[0, 0] = Exp(LSTMoutput1.sWmr[1, 1])
-LSTMoutput1.sWmr[1, 1] = LearnableParameter
-LSTMoutput1.Wmr[256, 1024] = LearnableParameter
-LSTMoutput2.expsWxo[0, 0] = Exp(LSTMoutput2.sWxo[1, 1])
-LSTMoutput2.sWxo[1, 1] = LearnableParameter
-LSTMoutput2.Wxo[1024, 256] = LearnableParameter
-LSTMoutput2.expsWmr[0, 0] = Exp(LSTMoutput2.sWmr[1, 1])
-LSTMoutput2.sWmr[1, 1] = LearnableParameter
-LSTMoutput2.Wmr[256, 1024] = LearnableParameter
-LSTMoutput3.expsWxo[0, 0] = Exp(LSTMoutput3.sWxo[1, 1])
-LSTMoutput3.sWxo[1, 1] = LearnableParameter
-LSTMoutput3.Wxo[1024, 256] = LearnableParameter
-LSTMoutput3.expsWmr[0, 0] = Exp(LSTMoutput3.sWmr[1, 1])
-LSTMoutput3.sWmr[1, 1] = LearnableParameter
-LSTMoutput3.Wmr[256, 1024] = LearnableParameter
-expsW[0, 0] = Exp(sW[1, 1])
-sW[1, 1] = LearnableParameter
-W[132, 256] = LearnableParameter
-labels[132, 1] = InputValue
-
-Validating node cr 
- nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
-
-Validating node cr 
-
-Validating --> labels = InputValue -> [132, MBSize 1]
-Validating --> W = LearnableParameter -> [132, 256]
-Validating --> sW = LearnableParameter -> [1, 1]
-Validating --> expsW = Exp(sW[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput3.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1]
-Validating --> features = InputValue -> [363, MBSize 1]
-Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
-Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, MBSize 1], LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256 {W=6854767804416, H=4294967295, C=0}, 1]) -> [256, 1]
-Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256 {W=6854767804416, H=4294967295, C=0}, 1]) -> [1024, 1]
-Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256 {W=6854767804416, H=4294967295, C=0}, 1]) -> [256, 1]
-Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256 {W=6854767804416, H=4294967295, C=0}, 1]) -> [1024, 1]
-Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, 1]) -> [1024, 1]
-Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3080, H=1, C=1480711769}, 1]) -> [1024, 1]
-Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3080, H=1, C=1480711769}, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256 {W=6854767804416, H=4294967295, C=0}, 1]) -> [256, 1]
-Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256 {W=6854767804416, H=4294967295, C=0}, 1]) -> [1024, 1]
-Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, 1]) -> [1024, 1]
-Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3080, H=1, C=1480711769}, 1]) -> [1024, 1]
-Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3080, H=1, C=1480711769}, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256 {W=6854767804416, H=4294967295, C=0}, 1]) -> [256, 1]
-Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256 {W=6854767804416, H=4294967295, C=0}, 1]) -> [1024, 1]
-Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 1], LSTMoutput1.bc[1024, 1]) -> [1024, 1]
-Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.bit[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 1], LSTMoutput1.Wcoct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.unnamed174[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256 {W=4121110508178779188, H=3474281011731246130, C=4116338653636141100}, 1]) -> [256, 1]
-Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256 {W=4121110508178779188, H=3474281011731246130, C=4116338653636141100}, 1]) -> [1024, 1]
-Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256 {W=4121110508178779188, H=3474281011731246130, C=4116338653636141100}, 1]) -> [256, 1]
-Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256 {W=4121110508178779188, H=3474281011731246130, C=4116338653636141100}, 1]) -> [1024, 1]
-Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, 1]) -> [1024, 1]
-Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, 1]) -> [1024, 1]
-Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256 {W=4121110508178779188, H=3474281011731246130, C=4116338653636141100}, 1]) -> [256, 1]
-Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256 {W=4121110508178779188, H=3474281011731246130, C=4116338653636141100}, 1]) -> [1024, 1]
-Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, 1]) -> [1024, 1]
-Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, 1]) -> [1024, 1]
-Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256 {W=4121110508178779188, H=3474281011731246130, C=4116338653636141100}, 1]) -> [256, 1]
-Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256 {W=4121110508178779188, H=3474281011731246130, C=4116338653636141100}, 1]) -> [1024, 1]
-Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, 1], LSTMoutput2.bc[1024, 1]) -> [1024, 1]
-Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.bit[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed224[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256 {W=3186069275638247986, H=3760560802616062240, C=3900165879994916908}, 1]) -> [256, 1]
-Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256 {W=3186069275638247986, H=3760560802616062240, C=3900165879994916908}, 1]) -> [1024, 1]
-Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256 {W=3186069275638247986, H=3760560802616062240, C=3900165879994916908}, 1]) -> [256, 1]
-Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256 {W=3186069275638247986, H=3760560802616062240, C=3900165879994916908}, 1]) -> [1024, 1]
-Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, 1]) -> [1024, 1]
-Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, 1]) -> [1024, 1]
-Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256 {W=3186069275638247986, H=3760560802616062240, C=3900165879994916908}, 1]) -> [256, 1]
-Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256 {W=3186069275638247986, H=3760560802616062240, C=3900165879994916908}, 1]) -> [1024, 1]
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, 1]) -> [1024, 1]
-Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, 1]) -> [1024, 1]
-Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], LSTMoutput3.Wcidc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256 {W=3186069275638247986, H=3760560802616062240, C=3900165879994916908}, 1]) -> [256, 1]
-Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256 {W=3186069275638247986, H=3760560802616062240, C=3900165879994916908}, 1]) -> [1024, 1]
-Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 1], LSTMoutput3.bc[1024, 1]) -> [1024, 1]
-Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], LSTMoutput3.unnamed261[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.bit[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, MBSize 1]) -> [132, MBSize 1]
-Validating --> b = LearnableParameter -> [132, 1]
-Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1]
-Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
----
-Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-
-Revalidating
-
-
-
-
-
-127 out of 272 nodes do not share the minibatch layout with the input data.
-
-Validating --> labels = InputValue -> [132, MBSize 1]
-Validating --> W = LearnableParameter -> [132, 256]
-Validating --> sW = LearnableParameter -> [1, 1]
-Validating --> expsW = Exp(sW[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput3.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1]
-Validating --> features = InputValue -> [363, MBSize 1]
-Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
-Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, MBSize 1], LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, MBSize 1], LSTMoutput1.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.bit[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 1], LSTMoutput1.Wcoct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.unnamed174[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, MBSize 1], LSTMoutput2.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.bit[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed224[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], LSTMoutput3.Wcidc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, MBSize 1], LSTMoutput3.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], LSTMoutput3.unnamed261[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.bit[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, MBSize 1]) -> [132, MBSize 1]
-Validating --> b = LearnableParameter -> [132, 1]
-Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1]
-Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
----
-Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-
-Revalidating
-
-
-
-
-
-127 out of 272 nodes do not share the minibatch layout with the input data.
- nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
-
-Validating node ScaledLogLikelihood 
- nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
-
-Validating node ScaledLogLikelihood 
-
-Validating --> W = LearnableParameter -> [132, 256]
-Validating --> sW = LearnableParameter -> [1, 1]
-Validating --> expsW = Exp(sW[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput3.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1]
-Validating --> features = InputValue -> [363, MBSize 1]
-Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
-Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, MBSize 1], LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, MBSize 1], LSTMoutput1.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.bit[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 1], LSTMoutput1.Wcoct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.unnamed174[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, MBSize 1], LSTMoutput2.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.bit[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed224[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], LSTMoutput3.Wcidc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, MBSize 1], LSTMoutput3.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], LSTMoutput3.unnamed261[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.bit[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, MBSize 1]) -> [132, MBSize 1]
-Validating --> b = LearnableParameter -> [132, 1]
-Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1]
-Validating --> labels = InputValue -> [132, MBSize 1]
-Validating --> logPrior.Prior = Mean(labels[132, MBSize 1]) -> [132, 1]
-Validating --> logPrior.LogPrior = Log(logPrior.Prior[132, 1]) -> [132, 1]
-Validating --> ScaledLogLikelihood = Minus(LSTMoutputW[132, MBSize 1], logPrior.LogPrior[132, 1]) -> [132, MBSize 1]
----
-Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-
-Revalidating
-
-
-
-
-
-128 out of 274 nodes do not share the minibatch layout with the input data.
-
-Validating --> W = LearnableParameter -> [132, 256]
-Validating --> sW = LearnableParameter -> [1, 1]
-Validating --> expsW = Exp(sW[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput3.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1]
-Validating --> features = InputValue -> [363, MBSize 1]
-Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
-Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, MBSize 1], LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, MBSize 1], LSTMoutput1.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.bit[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 1], LSTMoutput1.Wcoct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.unnamed174[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, MBSize 1], LSTMoutput2.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.bit[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed224[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], LSTMoutput3.Wcidc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, MBSize 1], LSTMoutput3.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], LSTMoutput3.unnamed261[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.bit[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, MBSize 1]) -> [132, MBSize 1]
-Validating --> b = LearnableParameter -> [132, 1]
-Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1]
-Validating --> labels = InputValue -> [132, MBSize 1]
-Validating --> logPrior.Prior = Mean(labels[132, MBSize 1]) -> [132, 1]
-Validating --> logPrior.LogPrior = Log(logPrior.Prior[132, 1]) -> [132, 1]
-Validating --> ScaledLogLikelihood = Minus(LSTMoutputW[132, MBSize 1], logPrior.LogPrior[132, 1]) -> [132, MBSize 1]
----
-Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-
-Revalidating
-
-
-
-
-
-128 out of 274 nodes do not share the minibatch layout with the input data.
- nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
-
-Validating node Err 
- nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
-
-Validating node Err 
-
-Validating --> labels = InputValue -> [132, MBSize 1]
-Validating --> W = LearnableParameter -> [132, 256]
-Validating --> sW = LearnableParameter -> [1, 1]
-Validating --> expsW = Exp(sW[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput3.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1]
-Validating --> features = InputValue -> [363, MBSize 1]
-Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
-Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, MBSize 1], LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, MBSize 1], LSTMoutput1.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.bit[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 1], LSTMoutput1.Wcoct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.unnamed174[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, MBSize 1], LSTMoutput2.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.bit[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed224[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], LSTMoutput3.Wcidc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, MBSize 1], LSTMoutput3.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], LSTMoutput3.unnamed261[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.bit[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, MBSize 1]) -> [132, MBSize 1]
-Validating --> b = LearnableParameter -> [132, 1]
-Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
----
-Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-
-Revalidating
-
-
-
-
-
-127 out of 272 nodes do not share the minibatch layout with the input data.
-
-Validating --> labels = InputValue -> [132, MBSize 1]
-Validating --> W = LearnableParameter -> [132, 256]
-Validating --> sW = LearnableParameter -> [1, 1]
-Validating --> expsW = Exp(sW[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput3.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1]
-Validating --> features = InputValue -> [363, MBSize 1]
-Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
-Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, MBSize 1], LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, MBSize 1], LSTMoutput1.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.bit[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 1], LSTMoutput1.Wcoct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3080, H=1, C=1480711769}, MBSize 1], LSTMoutput1.unnamed174[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, MBSize 1], LSTMoutput2.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.bit[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1], LSTMoutput2.unnamed224[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], LSTMoutput3.Wcidc[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, MBSize 1], LSTMoutput3.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], LSTMoutput3.unnamed261[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.bit[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, MBSize 1]) -> [132, MBSize 1]
-Validating --> b = LearnableParameter -> [132, 1]
-Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
----
-Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3080, H=1, C=1480711769}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3184080310709005360, H=3467820298285101088, C=2318280822927401004}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=2318286414773236018, H=3184080310709005360, C=3904937726200394016}, MBSize 1]) -> [1024, MBSize 1]
-
-Revalidating
-
-
-
-
-
-127 out of 272 nodes do not share the minibatch layout with the input data.
+Node --> B = LearnableParameter
+Node --> labels = InputValue
+Node --> LSTMoutputW./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].Wmr = LearnableParameter
+Node --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].Wmr = LearnableParameter
+Node --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].Wmr = LearnableParameter
+Node --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> features = InputValue
+Node --> feashift = RowSlice
+Node --> featNorm.meanVector = Mean
+Node --> featNorm.invStdDevVector = InvStdDev
+Node --> featNorm = PerDimMeanVarNormalization
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].dh = PastValue
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[1].ot.z./*+*/left = Plus
+Node --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[1].ft.z./*+*/left = Plus
+Node --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].dc = PastValue
+Node --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[1].ft.z./*+*/right = DiagTimes
+Node --> LSTMoutput[1].ft.z = Plus
+Node --> LSTMoutput[1].ft = Sigmoid
+Node --> LSTMoutput[1].bft = ElementTimes
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[1].it.z./*+*/left = Plus
+Node --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].it.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[1].it.z./*+*/right = DiagTimes
+Node --> LSTMoutput[1].it.z = Plus
+Node --> LSTMoutput[1].it = Sigmoid
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus
+Node --> LSTMoutput[1].bit./*.**/right.z = Plus
+Node --> LSTMoutput[1].bit./*.**/right = Tanh
+Node --> LSTMoutput[1].bit = ElementTimes
+Node --> LSTMoutput[1].ct = Plus
+Node --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[1].ot.z./*+*/right = DiagTimes
+Node --> LSTMoutput[1].ot.z = Plus
+Node --> LSTMoutput[1].ot = Sigmoid
+Node --> LSTMoutput[1].mt./*.**/right = Tanh
+Node --> LSTMoutput[1].mt = ElementTimes
+Node --> LSTMoutput[1].output./***/right = Scale
+Node --> LSTMoutput[1].output = Times
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].dh = PastValue
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[2].ot.z./*+*/left = Plus
+Node --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[2].ft.z./*+*/left = Plus
+Node --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].dc = PastValue
+Node --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[2].ft.z./*+*/right = DiagTimes
+Node --> LSTMoutput[2].ft.z = Plus
+Node --> LSTMoutput[2].ft = Sigmoid
+Node --> LSTMoutput[2].bft = ElementTimes
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[2].it.z./*+*/left = Plus
+Node --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].it.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[2].it.z./*+*/right = DiagTimes
+Node --> LSTMoutput[2].it.z = Plus
+Node --> LSTMoutput[2].it = Sigmoid
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus
+Node --> LSTMoutput[2].bit./*.**/right.z = Plus
+Node --> LSTMoutput[2].bit./*.**/right = Tanh
+Node --> LSTMoutput[2].bit = ElementTimes
+Node --> LSTMoutput[2].ct = Plus
+Node --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[2].ot.z./*+*/right = DiagTimes
+Node --> LSTMoutput[2].ot.z = Plus
+Node --> LSTMoutput[2].ot = Sigmoid
+Node --> LSTMoutput[2].mt./*.**/right = Tanh
+Node --> LSTMoutput[2].mt = ElementTimes
+Node --> LSTMoutput[2].output./***/right = Scale
+Node --> LSTMoutput[2].output = Times
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].dh = PastValue
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[3].ot.z./*+*/left = Plus
+Node --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[3].ft.z./*+*/left = Plus
+Node --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].dc = PastValue
+Node --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[3].ft.z./*+*/right = DiagTimes
+Node --> LSTMoutput[3].ft.z = Plus
+Node --> LSTMoutput[3].ft = Sigmoid
+Node --> LSTMoutput[3].bft = ElementTimes
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[3].it.z./*+*/left = Plus
+Node --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].it.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[3].it.z./*+*/right = DiagTimes
+Node --> LSTMoutput[3].it.z = Plus
+Node --> LSTMoutput[3].it = Sigmoid
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus
+Node --> LSTMoutput[3].bit./*.**/right.z = Plus
+Node --> LSTMoutput[3].bit./*.**/right = Tanh
+Node --> LSTMoutput[3].bit = ElementTimes
+Node --> LSTMoutput[3].ct = Plus
+Node --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[3].ot.z./*+*/right = DiagTimes
+Node --> LSTMoutput[3].ot.z = Plus
+Node --> LSTMoutput[3].ot = Sigmoid
+Node --> LSTMoutput[3].mt./*.**/right = Tanh
+Node --> LSTMoutput[3].mt = ElementTimes
+Node --> LSTMoutput[3].output./***/right = Scale
+Node --> LSTMoutput[3].output = Times
+Node --> LSTMoutputW./*+*/left./***/right = Scale
+Node --> LSTMoutputW./*+*/left = Times
+Node --> LSTMoutputW = Plus
+Node --> Err = ErrorPrediction
+Node --> logPrior.x = Mean
+Node --> logPrior = Log
+Node --> ScaledLogLikelihood = Minus
+Node --> cr = CrossEntropyWithSoftmax
+N9Microsoft3MSR4CNTK18ComputationNetworkE [
+  B : LearnableParameter 132 x 1 ()
+  cr : CrossEntropyWithSoftmax 0 x 0 (
+    labels
+    LSTMoutputW
+  )
+  Err : ErrorPrediction 0 x 0 (
+    labels
+    LSTMoutputW
+  )
+  feashift : RowSlice 0 x 0 (
+    features
+  )
+  featNorm : PerDimMeanVarNormalization 0 x 0 (
+    feashift
+    featNorm.meanVector
+    featNorm.invStdDevVector
+  )
+  featNorm.invStdDevVector : InvStdDev 0 x 0 (
+    feashift
+  )
+  featNorm.meanVector : Mean 0 x 0 (
+    feashift
+  )
+  features : InputValue 363 x 1 ()
+  labels : InputValue 132 x 1 ()
+  logPrior : Log 0 x 0 (
+    logPrior.x
+  )
+  logPrior.x : Mean 0 x 0 (
+    labels
+  )
+  LSTMoutput[1].bft : ElementTimes 0 x 0 (
+    LSTMoutput[1].ft
+    LSTMoutput[1].dc
+  )
+  LSTMoutput[1].bit : ElementTimes 0 x 0 (
+    LSTMoutput[1].it
+    LSTMoutput[1].bit./*.**/right
+  )
+  LSTMoutput[1].bit./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z
+  )
+  LSTMoutput[1].bit./*.**/right.z : Plus 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/left
+    LSTMoutput[1].bit./*.**/right.z./*+*/right
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/left : Times 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left
+    LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 33 ()
+  LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor
+    featNorm
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].bit./*.**/right.z./*+*/right : Plus 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[1].dh
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].ct : Plus 0 x 0 (
+    LSTMoutput[1].bft
+    LSTMoutput[1].bit
+  )
+  LSTMoutput[1].dc : PastValue 1024 x 1 (
+    LSTMoutput[1].ct
+  )
+  LSTMoutput[1].dh : PastValue 256 x 1 (
+    LSTMoutput[1].output
+  )
+  LSTMoutput[1].ft : Sigmoid 0 x 0 (
+    LSTMoutput[1].ft.z
+  )
+  LSTMoutput[1].ft.z : Plus 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left
+    LSTMoutput[1].ft.z./*+*/right
+  )
+  LSTMoutput[1].ft.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/left
+    LSTMoutput[1].ft.z./*+*/left./*+*/right
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 ()
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    featNorm
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].ft.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left
+    LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[1].dh
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].ft.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[1].ft.z./*+*/right.matrix
+  )
+  LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].ft.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[1].dc
+  )
+  LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].it : Sigmoid 0 x 0 (
+    LSTMoutput[1].it.z
+  )
+  LSTMoutput[1].it.z : Plus 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left
+    LSTMoutput[1].it.z./*+*/right
+  )
+  LSTMoutput[1].it.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/left
+    LSTMoutput[1].it.z./*+*/left./*+*/right
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 ()
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    featNorm
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].it.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/right./***/left
+    LSTMoutput[1].it.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[1].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[1].dh
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].it.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[1].it.z./*+*/right.matrix
+  )
+  LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].it.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[1].dc
+  )
+  LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].mt : ElementTimes 0 x 0 (
+    LSTMoutput[1].ot
+    LSTMoutput[1].mt./*.**/right
+  )
+  LSTMoutput[1].mt./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[1].ct
+  )
+  LSTMoutput[1].ot : Sigmoid 0 x 0 (
+    LSTMoutput[1].ot.z
+  )
+  LSTMoutput[1].ot.z : Plus 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left
+    LSTMoutput[1].ot.z./*+*/right
+  )
+  LSTMoutput[1].ot.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/left
+    LSTMoutput[1].ot.z./*+*/left./*+*/right
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 ()
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    featNorm
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].ot.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left
+    LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[1].dh
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].ot.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[1].ot.z./*+*/right.matrix
+  )
+  LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].ot.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[1].ct
+  )
+  LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].output : Times 0 x 0 (
+    LSTMoutput[1].Wmr
+    LSTMoutput[1].output./***/right
+  )
+  LSTMoutput[1].output./***/right : Scale 0 x 0 (
+    LSTMoutput[1].output./***/right.scalarScalingFactor
+    LSTMoutput[1].mt
+  )
+  LSTMoutput[1].output./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].output./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].Wmr : LearnableParameter 256 x 1024 ()
+  LSTMoutput[2].bft : ElementTimes 0 x 0 (
+    LSTMoutput[2].ft
+    LSTMoutput[2].dc
+  )
+  LSTMoutput[2].bit : ElementTimes 0 x 0 (
+    LSTMoutput[2].it
+    LSTMoutput[2].bit./*.**/right
+  )
+  LSTMoutput[2].bit./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z
+  )
+  LSTMoutput[2].bit./*.**/right.z : Plus 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/left
+    LSTMoutput[2].bit./*.**/right.z./*+*/right
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/left : Times 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left
+    LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[1].output
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].bit./*.**/right.z./*+*/right : Plus 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[2].dh
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].ct : Plus 0 x 0 (
+    LSTMoutput[2].bft
+    LSTMoutput[2].bit
+  )
+  LSTMoutput[2].dc : PastValue 1024 x 1 (
+    LSTMoutput[2].ct
+  )
+  LSTMoutput[2].dh : PastValue 256 x 1 (
+    LSTMoutput[2].output
+  )
+  LSTMoutput[2].ft : Sigmoid 0 x 0 (
+    LSTMoutput[2].ft.z
+  )
+  LSTMoutput[2].ft.z : Plus 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left
+    LSTMoutput[2].ft.z./*+*/right
+  )
+  LSTMoutput[2].ft.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/left
+    LSTMoutput[2].ft.z./*+*/left./*+*/right
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[1].output
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].ft.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left
+    LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[2].dh
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].ft.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[2].ft.z./*+*/right.matrix
+  )
+  LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].ft.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[2].dc
+  )
+  LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].it : Sigmoid 0 x 0 (
+    LSTMoutput[2].it.z
+  )
+  LSTMoutput[2].it.z : Plus 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left
+    LSTMoutput[2].it.z./*+*/right
+  )
+  LSTMoutput[2].it.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/left
+    LSTMoutput[2].it.z./*+*/left./*+*/right
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[1].output
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].it.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/right./***/left
+    LSTMoutput[2].it.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[2].dh
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].it.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[2].it.z./*+*/right.matrix
+  )
+  LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].it.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[2].dc
+  )
+  LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].mt : ElementTimes 0 x 0 (
+    LSTMoutput[2].ot
+    LSTMoutput[2].mt./*.**/right
+  )
+  LSTMoutput[2].mt./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[2].ct
+  )
+  LSTMoutput[2].ot : Sigmoid 0 x 0 (
+    LSTMoutput[2].ot.z
+  )
+  LSTMoutput[2].ot.z : Plus 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left
+    LSTMoutput[2].ot.z./*+*/right
+  )
+  LSTMoutput[2].ot.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/left
+    LSTMoutput[2].ot.z./*+*/left./*+*/right
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[1].output
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].ot.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left
+    LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[2].dh
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].ot.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[2].ot.z./*+*/right.matrix
+  )
+  LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].ot.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[2].ct
+  )
+  LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].output : Times 0 x 0 (
+    LSTMoutput[2].Wmr
+    LSTMoutput[2].output./***/right
+  )
+  LSTMoutput[2].output./***/right : Scale 0 x 0 (
+    LSTMoutput[2].output./***/right.scalarScalingFactor
+    LSTMoutput[2].mt
+  )
+  LSTMoutput[2].output./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].output./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].Wmr : LearnableParameter 256 x 1024 ()
+  LSTMoutput[3].bft : ElementTimes 0 x 0 (
+    LSTMoutput[3].ft
+    LSTMoutput[3].dc
+  )
+  LSTMoutput[3].bit : ElementTimes 0 x 0 (
+    LSTMoutput[3].it
+    LSTMoutput[3].bit./*.**/right
+  )
+  LSTMoutput[3].bit./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z
+  )
+  LSTMoutput[3].bit./*.**/right.z : Plus 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/left
+    LSTMoutput[3].bit./*.**/right.z./*+*/right
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/left : Times 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left
+    LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[2].output
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].bit./*.**/right.z./*+*/right : Plus 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[3].dh
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].ct : Plus 0 x 0 (
+    LSTMoutput[3].bft
+    LSTMoutput[3].bit
+  )
+  LSTMoutput[3].dc : PastValue 1024 x 1 (
+    LSTMoutput[3].ct
+  )
+  LSTMoutput[3].dh : PastValue 256 x 1 (
+    LSTMoutput[3].output
+  )
+  LSTMoutput[3].ft : Sigmoid 0 x 0 (
+    LSTMoutput[3].ft.z
+  )
+  LSTMoutput[3].ft.z : Plus 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left
+    LSTMoutput[3].ft.z./*+*/right
+  )
+  LSTMoutput[3].ft.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/left
+    LSTMoutput[3].ft.z./*+*/left./*+*/right
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[2].output
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].ft.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left
+    LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[3].dh
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].ft.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[3].ft.z./*+*/right.matrix
+  )
+  LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].ft.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[3].dc
+  )
+  LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].it : Sigmoid 0 x 0 (
+    LSTMoutput[3].it.z
+  )
+  LSTMoutput[3].it.z : Plus 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left
+    LSTMoutput[3].it.z./*+*/right
+  )
+  LSTMoutput[3].it.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/left
+    LSTMoutput[3].it.z./*+*/left./*+*/right
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[2].output
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].it.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/right./***/left
+    LSTMoutput[3].it.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[3].dh
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].it.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[3].it.z./*+*/right.matrix
+  )
+  LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].it.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[3].dc
+  )
+  LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].mt : ElementTimes 0 x 0 (
+    LSTMoutput[3].ot
+    LSTMoutput[3].mt./*.**/right
+  )
+  LSTMoutput[3].mt./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[3].ct
+  )
+  LSTMoutput[3].ot : Sigmoid 0 x 0 (
+    LSTMoutput[3].ot.z
+  )
+  LSTMoutput[3].ot.z : Plus 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left
+    LSTMoutput[3].ot.z./*+*/right
+  )
+  LSTMoutput[3].ot.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/left
+    LSTMoutput[3].ot.z./*+*/left./*+*/right
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[2].output
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].ot.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left
+    LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[3].dh
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].ot.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[3].ot.z./*+*/right.matrix
+  )
+  LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].ot.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[3].ct
+  )
+  LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].output : Times 0 x 0 (
+    LSTMoutput[3].Wmr
+    LSTMoutput[3].output./***/right
+  )
+  LSTMoutput[3].output./***/right : Scale 0 x 0 (
+    LSTMoutput[3].output./***/right.scalarScalingFactor
+    LSTMoutput[3].mt
+  )
+  LSTMoutput[3].output./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].output./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].Wmr : LearnableParameter 256 x 1024 ()
+  LSTMoutputW : Plus 0 x 0 (
+    LSTMoutputW./*+*/left
+    B
+  )
+  LSTMoutputW./*+*/left : Times 0 x 0 (
+    LSTMoutputW./*+*/left./***/left
+    LSTMoutputW./*+*/left./***/right
+  )
+  LSTMoutputW./*+*/left./***/left : LearnableParameter 132 x 256 ()
+  LSTMoutputW./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutputW./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[3].output
+  )
+  LSTMoutputW./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  ScaledLogLikelihood : Minus 0 x 0 (
+    LSTMoutputW
+    logPrior
+  )
+]
 GetTrainCriterionNodes  ...
 GetEvalCriterionNodes  ...
-Found 6 PreCompute nodes
-	NodeName: featNorm.xMean
-	NodeName: featNorm.xStdDev
-	NodeName: logPrior.Prior
-	NodeName: featNorm.xMean
-	NodeName: featNorm.xStdDev
-	NodeName: logPrior.Prior
+ nodes in the recurrent loops : 
+LSTMoutput[1].mt./*.**/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].mt./*.**/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].mt./*.**/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	 nodes in the recurrent loops : 
+LSTMoutput[1].mt./*.**/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].mt./*.**/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].mt./*.**/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	
+
+Validating for node cr. 272 nodes to process in pass 1.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
+
+Validating for node cr. 183 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
+
+Validating for node cr. 60 nodes to process in pass 3.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
+
+Validating for node cr, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
+
+127 out of 272 nodes do not share the minibatch layout with the input data.
+
+
+Precomputing --> 3 PreCompute nodes found.
+
+	NodeName: featNorm.invStdDevVector
+	NodeName: featNorm.meanVector
+	NodeName: logPrior.x
 minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
 requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
  nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	 nodes in the recurrent loops : 
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	
 
-Validating node featNorm.xMean 
+Validating for node featNorm.invStdDevVector. 3 nodes to process in pass 1.
 
-Validating --> features = InputValue -> [363, MBSize 348]
-Validating --> feashift = RowSlice(features[363, MBSize 348]) -> [33, MBSize 348]
-Validating --> featNorm.xMean = Mean(feashift[33, MBSize 348]) -> [33, 1]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
 
+Validating for node featNorm.invStdDevVector, final verification.
 
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
 
 1 out of 3 nodes do not share the minibatch layout with the input data.
+
  nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	 nodes in the recurrent loops : 
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	
 
-Validating node featNorm.xStdDev 
+Validating for node featNorm.meanVector. 3 nodes to process in pass 1.
 
-Validating --> features = InputValue -> [363, MBSize 348]
-Validating --> feashift = RowSlice(features[363, MBSize 348]) -> [33, MBSize 348]
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 348]) -> [33, 1]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
 
+Validating for node featNorm.meanVector, final verification.
 
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
 
 1 out of 3 nodes do not share the minibatch layout with the input data.
+
  nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	 nodes in the recurrent loops : 
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	
 
-Validating node logPrior.Prior 
+Validating for node logPrior.x. 2 nodes to process in pass 1.
 
-Validating --> labels = InputValue -> [132, MBSize 348]
-Validating --> logPrior.Prior = Mean(labels[132, MBSize 348]) -> [132, 1]
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1]
 
+Validating for node logPrior.x. 1 nodes to process in pass 2.
 
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1]
+
+Validating for node logPrior.x, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1]
 
 1 out of 2 nodes do not share the minibatch layout with the input data.
+
+EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once.
+
+Precomputing --> Completed.
+
 Set Max Temp Mem Size For Convolution Nodes to 0 samples.
-Starting Epoch 1: learning rate per sample = 0.025000  momentum = 0.000000 
+Starting Epoch 1: learning rate per sample = 0.025000  effective momentum = 0.000000 
 minibatchiterator: epoch 0: frames [0..2560] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
+ nodes in the recurrent loops : 
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	 nodes in the recurrent loops : 
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	
+
+Validating for node Err. 272 nodes to process in pass 1.
+
+Validating --> labels = InputValue -> [132, MBSize 218]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 218]
+Validating --> feashift = RowSlice(features[363, MBSize 218]) -> [33, MBSize 218]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 218]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 218]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 218], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 218], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 218], LSTMoutput[1].bit./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 218], LSTMoutput[1].bit[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 218], LSTMoutput[1].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 218], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 218], LSTMoutput[2].bit./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 218], LSTMoutput[2].bit[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 218], LSTMoutput[2].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 218], LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 218], LSTMoutput[3].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 218], LSTMoutput[3].bit./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 218], LSTMoutput[3].bit[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 218], LSTMoutput[3].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 218]) -> [132, MBSize 218]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 218], B[132, 1]) -> [132, MBSize 218]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 218], LSTMoutputW[132, MBSize 218]) -> [1, 1]
+
+Validating for node Err. 180 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 218]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 218]
+Validating --> feashift = RowSlice(features[363, MBSize 218]) -> [33, MBSize 218]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 218]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 218]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 218], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 218], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 218], LSTMoutput[1].bit./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 218], LSTMoutput[1].bit[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 218], LSTMoutput[1].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 218], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 218], LSTMoutput[2].bit./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 218], LSTMoutput[2].bit[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 218], LSTMoutput[2].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 218], LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 218], LSTMoutput[3].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 218], LSTMoutput[3].bit./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 218], LSTMoutput[3].bit[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 218], LSTMoutput[3].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 218]) -> [132, MBSize 218]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 218], B[132, 1]) -> [132, MBSize 218]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 218], LSTMoutputW[132, MBSize 218]) -> [1, 1]
+
+Validating for node Err. 6 nodes to process in pass 3.
+
+Validating --> labels = InputValue -> [132, MBSize 218]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 218]
+Validating --> feashift = RowSlice(features[363, MBSize 218]) -> [33, MBSize 218]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 218]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 218]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 218], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 218], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 218], LSTMoutput[1].bit./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 218], LSTMoutput[1].bit[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 218], LSTMoutput[1].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 218], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 218], LSTMoutput[2].bit./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 218], LSTMoutput[2].bit[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 218], LSTMoutput[2].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 218], LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 218], LSTMoutput[3].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 218], LSTMoutput[3].bit./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 218], LSTMoutput[3].bit[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 218], LSTMoutput[3].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 218]) -> [132, MBSize 218]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 218], B[132, 1]) -> [132, MBSize 218]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 218], LSTMoutputW[132, MBSize 218]) -> [1, 1]
+
+Validating for node Err, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 218]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 218]
+Validating --> feashift = RowSlice(features[363, MBSize 218]) -> [33, MBSize 218]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 218]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 218]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 218], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 218]) -> [33, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 218], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 218], LSTMoutput[1].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 218], LSTMoutput[1].bit./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 218], LSTMoutput[1].bit[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 218], LSTMoutput[1].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 218], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 218], LSTMoutput[2].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 218], LSTMoutput[2].bit./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 218], LSTMoutput[2].bit[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 218], LSTMoutput[2].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 218], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 218], LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 218], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 218], LSTMoutput[3].it.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 218], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 218], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 218], LSTMoutput[3].bit./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 218], LSTMoutput[3].bit[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 218], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 218], LSTMoutput[3].mt./*.**/right[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 218]) -> [1024, MBSize 218]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 218]) -> [256, MBSize 218]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 218]) -> [132, MBSize 218]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 218], B[132, 1]) -> [132, MBSize 218]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 218], LSTMoutputW[132, MBSize 218]) -> [1, 1]
+
+127 out of 272 nodes do not share the minibatch layout with the input data.
+
 
 Starting minibatch loop.
- Epoch[ 1 of 2]-Minibatch[   1-   1 of 128]: SamplesSeen = 348; TrainLossPerSample =  4.88242489; EvalErr[0]PerSample = 0.99137931; TotalTime = 4.53694s; TotalTimePerSample = 13.03719ms; SamplesPerSecond = 76
- Epoch[ 1 of 2]-Minibatch[   2-   2 of 128]: SamplesSeen = 168; TrainLossPerSample =  4.61489796; EvalErr[0]PerSample = 0.93452381; TotalTime = 2.28522s; TotalTimePerSample = 13.60248ms; SamplesPerSecond = 73
- Epoch[ 1 of 2]-Minibatch[   3-   3 of 128]: SamplesSeen = 198; TrainLossPerSample =  4.45420884; EvalErr[0]PerSample = 0.87373737; TotalTime = 2.33478s; TotalTimePerSample = 11.79183ms; SamplesPerSecond = 84
- Epoch[ 1 of 2]-Minibatch[   4-   4 of 128]: SamplesSeen = 258; TrainLossPerSample =  4.39047076; EvalErr[0]PerSample = 0.83333333; TotalTime = 3.50424s; TotalTimePerSample = 13.58233ms; SamplesPerSecond = 73
- Epoch[ 1 of 2]-Minibatch[   5-   5 of 128]: SamplesSeen = 248; TrainLossPerSample = 57.29654817; EvalErr[0]PerSample = 0.92338710; TotalTime = 2.91642s; TotalTimePerSample = 11.75974ms; SamplesPerSecond = 85
- Epoch[ 1 of 2]-Minibatch[   6-   6 of 128]: SamplesSeen = 358; TrainLossPerSample =  5.41497905; EvalErr[0]PerSample = 0.91620112; TotalTime = 4.56634s; TotalTimePerSample = 12.75514ms; SamplesPerSecond = 78
- Epoch[ 1 of 2]-Minibatch[   7-   7 of 128]: SamplesSeen = 98; TrainLossPerSample =  4.44218351; EvalErr[0]PerSample = 0.88775510; TotalTime = 1.34473s; TotalTimePerSample = 13.72177ms; SamplesPerSecond = 72
- Epoch[ 1 of 2]-Minibatch[   8-   8 of 128]: SamplesSeen = 278; TrainLossPerSample =  4.20048336; EvalErr[0]PerSample = 0.77697842; TotalTime = 3.89413s; TotalTimePerSample = 14.00767ms; SamplesPerSecond = 71
- Epoch[ 1 of 2]-Minibatch[   9-   9 of 128]: SamplesSeen = 288; TrainLossPerSample =  4.66156684; EvalErr[0]PerSample = 0.92361111; TotalTime = 3.77711s; TotalTimePerSample = 13.11497ms; SamplesPerSecond = 76
- Epoch[ 1 of 2]-Minibatch[  10-  10 of 128]: SamplesSeen = 258; TrainLossPerSample =  4.35901920; EvalErr[0]PerSample = 0.86434109; TotalTime = 3.59676s; TotalTimePerSample = 13.94092ms; SamplesPerSecond = 71
+ Epoch[ 1 of 2]-Minibatch[   1-   1 of 128]: SamplesSeen = 348; TrainLossPerSample =  4.88242489; EvalErr[0]PerSample = 0.99137931; TotalTime = 4.44437s; TotalTimePerSample = 12.77117ms; SamplesPerSecond = 78
+ Epoch[ 1 of 2]-Minibatch[   2-   2 of 128]: SamplesSeen = 168; TrainLossPerSample =  4.61489796; EvalErr[0]PerSample = 0.93452381; TotalTime = 2.16304s; TotalTimePerSample = 12.87527ms; SamplesPerSecond = 77
+ Epoch[ 1 of 2]-Minibatch[   3-   3 of 128]: SamplesSeen = 198; TrainLossPerSample =  4.45420760; EvalErr[0]PerSample = 0.87373737; TotalTime = 2.53836s; TotalTimePerSample = 12.81998ms; SamplesPerSecond = 78
+ Epoch[ 1 of 2]-Minibatch[   4-   4 of 128]: SamplesSeen = 258; TrainLossPerSample =  4.39046981; EvalErr[0]PerSample = 0.83333333; TotalTime = 3.29028s; TotalTimePerSample = 12.75303ms; SamplesPerSecond = 78
+ Epoch[ 1 of 2]-Minibatch[   5-   5 of 128]: SamplesSeen = 248; TrainLossPerSample = 57.29652651; EvalErr[0]PerSample = 0.92338710; TotalTime = 3.15566s; TotalTimePerSample = 12.72446ms; SamplesPerSecond = 78
+ Epoch[ 1 of 2]-Minibatch[   6-   6 of 128]: SamplesSeen = 358; TrainLossPerSample =  5.41497905; EvalErr[0]PerSample = 0.91620112; TotalTime = 4.60661s; TotalTimePerSample = 12.86763ms; SamplesPerSecond = 77
+ Epoch[ 1 of 2]-Minibatch[   7-   7 of 128]: SamplesSeen = 98; TrainLossPerSample =  4.44218351; EvalErr[0]PerSample = 0.88775510; TotalTime = 1.29487s; TotalTimePerSample = 13.21293ms; SamplesPerSecond = 75
+ Epoch[ 1 of 2]-Minibatch[   8-   8 of 128]: SamplesSeen = 278; TrainLossPerSample =  4.20048336; EvalErr[0]PerSample = 0.77697842; TotalTime = 3.55463s; TotalTimePerSample = 12.78645ms; SamplesPerSecond = 78
+ Epoch[ 1 of 2]-Minibatch[   9-   9 of 128]: SamplesSeen = 288; TrainLossPerSample =  4.66156684; EvalErr[0]PerSample = 0.92361111; TotalTime = 3.67845s; TotalTimePerSample = 12.77239ms; SamplesPerSecond = 78
+ Epoch[ 1 of 2]-Minibatch[  10-  10 of 128]: SamplesSeen = 258; TrainLossPerSample =  4.35901920; EvalErr[0]PerSample = 0.86434109; TotalTime = 3.30382s; TotalTimePerSample = 12.80552ms; SamplesPerSecond = 78
 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times.
- Epoch[ 1 of 2]-Minibatch[  11-  11 of 128]: SamplesSeen = 98; TrainLossPerSample =  3.70348374; EvalErr[0]PerSample = 0.67346939; TotalTime = 1.31943s; TotalTimePerSample = 13.46359ms; SamplesPerSecond = 74
-Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 9.6498594; EvalErrPerSample = 0.88722092; Ave LearnRatePerSample = 0.02500000037; EpochTime=34.105187
-Starting Epoch 2: learning rate per sample = 0.025000  momentum = 0.900000 
+ Epoch[ 1 of 2]-Minibatch[  11-  11 of 128]: SamplesSeen = 98; TrainLossPerSample =  3.70348374; EvalErr[0]PerSample = 0.67346939; TotalTime = 1.28901s; TotalTimePerSample = 13.15320ms; SamplesPerSecond = 76
+Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 9.6498575; EvalErrPerSample = 0.88722092; Ave LearnRatePerSample = 0.02500000037; EpochTime=33.380318
+Starting Epoch 2: learning rate per sample = 0.025000  effective momentum = 0.900000 
 minibatchiterator: epoch 1: frames [2560..5120] (first utterance at frame 2598), data subset 0 of 1, with 1 datapasses
 
 Starting minibatch loop.
- Epoch[ 2 of 2]-Minibatch[   1-   1 of 128]: SamplesSeen = 138; TrainLossPerSample =  4.16283737; EvalErr[0]PerSample = 0.82608696; TotalTime = 1.62122s; TotalTimePerSample = 11.74799ms; SamplesPerSecond = 85
- Epoch[ 2 of 2]-Minibatch[   2-   2 of 128]: SamplesSeen = 318; TrainLossPerSample =  4.31444468; EvalErr[0]PerSample = 0.94654088; TotalTime = 3.71468s; TotalTimePerSample = 11.68139ms; SamplesPerSecond = 85
- Epoch[ 2 of 2]-Minibatch[   3-   3 of 128]: SamplesSeen = 368; TrainLossPerSample =  4.13243335; EvalErr[0]PerSample = 0.88586957; TotalTime = 4.31795s; TotalTimePerSample = 11.73355ms; SamplesPerSecond = 85
- Epoch[ 2 of 2]-Minibatch[   4-   4 of 128]: SamplesSeen = 98; TrainLossPerSample =  3.75084204; EvalErr[0]PerSample = 1.00000000; TotalTime = 1.16138s; TotalTimePerSample = 11.85083ms; SamplesPerSecond = 84
- Epoch[ 2 of 2]-Minibatch[   5-   5 of 128]: SamplesSeen = 308; TrainLossPerSample =  3.53811210; EvalErr[0]PerSample = 0.81168831; TotalTime = 3.57337s; TotalTimePerSample = 11.60187ms; SamplesPerSecond = 86
- Epoch[ 2 of 2]-Minibatch[   6-   6 of 128]: SamplesSeen = 258; TrainLossPerSample =  3.79568458; EvalErr[0]PerSample = 0.94186047; TotalTime = 3.02145s; TotalTimePerSample = 11.71103ms; SamplesPerSecond = 85
- Epoch[ 2 of 2]-Minibatch[   7-   7 of 128]: SamplesSeen = 238; TrainLossPerSample =  4.43607414; EvalErr[0]PerSample = 0.97058824; TotalTime = 2.94950s; TotalTimePerSample = 12.39287ms; SamplesPerSecond = 80
- Epoch[ 2 of 2]-Minibatch[   8-   8 of 128]: SamplesSeen = 268; TrainLossPerSample =  4.03240876; EvalErr[0]PerSample = 0.86194030; TotalTime = 3.38811s; TotalTimePerSample = 12.64221ms; SamplesPerSecond = 79
- Epoch[ 2 of 2]-Minibatch[   9-   9 of 128]: SamplesSeen = 308; TrainLossPerSample =  4.48105849; EvalErr[0]PerSample = 0.95779221; TotalTime = 3.88247s; TotalTimePerSample = 12.60544ms; SamplesPerSecond = 79
- Epoch[ 2 of 2]-Minibatch[  10-  10 of 128]: SamplesSeen = 288; TrainLossPerSample =  4.29093424; EvalErr[0]PerSample = 0.92708333; TotalTime = 3.66267s; TotalTimePerSample = 12.71761ms; SamplesPerSecond = 78
-Finished Epoch[ 2 of 2]: [Training Set] TrainLossPerSample = 4.1143761; EvalErrPerSample = 0.90965247; Ave LearnRatePerSample = 0.02500000037; EpochTime=31.297491
+ Epoch[ 2 of 2]-Minibatch[   1-   1 of 128]: SamplesSeen = 138; TrainLossPerSample =  4.16283693; EvalErr[0]PerSample = 0.82608696; TotalTime = 1.79211s; TotalTimePerSample = 12.98633ms; SamplesPerSecond = 77
+ Epoch[ 2 of 2]-Minibatch[   2-   2 of 128]: SamplesSeen = 318; TrainLossPerSample =  4.31444449; EvalErr[0]PerSample = 0.94654088; TotalTime = 4.04166s; TotalTimePerSample = 12.70961ms; SamplesPerSecond = 78
+ Epoch[ 2 of 2]-Minibatch[   3-   3 of 128]: SamplesSeen = 368; TrainLossPerSample =  4.13243302; EvalErr[0]PerSample = 0.88586957; TotalTime = 4.73206s; TotalTimePerSample = 12.85885ms; SamplesPerSecond = 77
+ Epoch[ 2 of 2]-Minibatch[   4-   4 of 128]: SamplesSeen = 98; TrainLossPerSample =  3.75084204; EvalErr[0]PerSample = 1.00000000; TotalTime = 1.29247s; TotalTimePerSample = 13.18848ms; SamplesPerSecond = 75
+ Epoch[ 2 of 2]-Minibatch[   5-   5 of 128]: SamplesSeen = 308; TrainLossPerSample =  3.53811289; EvalErr[0]PerSample = 0.81168831; TotalTime = 3.92750s; TotalTimePerSample = 12.75161ms; SamplesPerSecond = 78
+ Epoch[ 2 of 2]-Minibatch[   6-   6 of 128]: SamplesSeen = 258; TrainLossPerSample =  3.79568458; EvalErr[0]PerSample = 0.94186047; TotalTime = 3.30113s; TotalTimePerSample = 12.79506ms; SamplesPerSecond = 78
+ Epoch[ 2 of 2]-Minibatch[   7-   7 of 128]: SamplesSeen = 238; TrainLossPerSample =  4.43607414; EvalErr[0]PerSample = 0.97058824; TotalTime = 3.05107s; TotalTimePerSample = 12.81964ms; SamplesPerSecond = 78
+ Epoch[ 2 of 2]-Minibatch[   8-   8 of 128]: SamplesSeen = 268; TrainLossPerSample =  4.03240876; EvalErr[0]PerSample = 0.86194030; TotalTime = 3.42811s; TotalTimePerSample = 12.79145ms; SamplesPerSecond = 78
+ Epoch[ 2 of 2]-Minibatch[   9-   9 of 128]: SamplesSeen = 308; TrainLossPerSample =  4.48105849; EvalErr[0]PerSample = 0.95779221; TotalTime = 3.92167s; TotalTimePerSample = 12.73270ms; SamplesPerSecond = 78
+ Epoch[ 2 of 2]-Minibatch[  10-  10 of 128]: SamplesSeen = 288; TrainLossPerSample =  4.29093424; EvalErr[0]PerSample = 0.92708333; TotalTime = 3.67384s; TotalTimePerSample = 12.75639ms; SamplesPerSecond = 78
+Finished Epoch[ 2 of 2]: [Training Set] TrainLossPerSample = 4.1143761; EvalErrPerSample = 0.90965247; Ave LearnRatePerSample = 0.02500000037; EpochTime=33.170301
 CNTKCommandTrainEnd: speechTrain
 COMPLETED
diff --git a/Tests/Speech/LSTM/FullUtterance/baseline.windows.gpu.txt b/Tests/Speech/LSTM/FullUtterance/baseline.windows.gpu.txt
index b508e0391..a220aa8f7 100644
--- a/Tests/Speech/LSTM/FullUtterance/baseline.windows.gpu.txt
+++ b/Tests/Speech/LSTM/FullUtterance/baseline.windows.gpu.txt
@@ -1,16 +1,16 @@
-=== Running /cygdrive/e/NetScale/CNTK/git_repos/cplx_master/x64/debug/cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\LSTM\cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151001133538.306652\Speech\LSTM_FullUtterance@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data DeviceId=0 NDLDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\LSTM Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=1]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]]
+=== Running /cygdrive/e/NetScale/CNTK/git_repos/cplx_master2/x64/debug/cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM/cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151024135143.969217\Speech\LSTM_FullUtterance@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM DeviceId=0 Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=1]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]]
 -------------------------------------------------------------------
 Build info: 
 
-		Built time: Sep 30 2015 17:18:44
-		Last modified date: Wed Sep 30 14:44:42 2015
+		Built time: Oct 24 2015 13:33:25
+		Last modified date: Thu Oct 22 16:00:27 2015
 		Built by amitaga on Amitaga-Win-DT3           
-		Build Path: E:\NetScale\CNTK\git_repos\cplx_master\MachineLearning\CNTK\
+		Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\
 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
 -------------------------------------------------------------------
-running on Amitaga-Win-DT3 at 2015/10/01 21:35:39
-command line options: 
-configFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\LSTM\cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151001133538.306652\Speech\LSTM_FullUtterance@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data DeviceId=0 NDLDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\LSTM Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=1]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]] 
+running on Amitaga-Win-DT3 at 2015/10/24 21:51:44
+command line: 
+E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM/cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151024135143.969217\Speech\LSTM_FullUtterance@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM DeviceId=0 Truncated=false speechTrain=[reader=[nbruttsineachrecurrentiter=1]] speechTrain=[SGD=[epochSize=2560]] speechTrain=[SGD=[maxEpochs=2]] speechTrain=[SGD=[numMBsToShowResult=1]] 
 
 >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
 precision=float
@@ -24,9 +24,6 @@ speechTrain=[
     modelPath=$RunDir$/models/cntkSpeech.dnn
     deviceId=$DeviceId$
     traceLevel=1
-    NDLNetworkBuilder=[
-        networkDescription=$NDLDir$/lstmp-3layer_WithSelfStab.ndl
-    ]    
     SGD=[
         epochSize=20480
         minibatchSize=20
@@ -200,10 +197,10 @@ feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features);
         ScaledLogLikelihood = Minus(LSTMoutputW, logPrior, tag='output')    // sadly we can't say x - y since we want to assign a tag
     ]
 ]
-RunDir=C:\cygwin64\tmp\cntk-test-20151001133538.306652\Speech\LSTM_FullUtterance@debug_gpu
-DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data
+RunDir=C:\cygwin64\tmp\cntk-test-20151024135143.969217\Speech\LSTM_FullUtterance@debug_gpu
+DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
+ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM
 DeviceId=0
-NDLDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\LSTM
 Truncated=false
 speechTrain=[reader=[nbruttsineachrecurrentiter=1]]
 speechTrain=[SGD=[epochSize=2560]]
@@ -221,12 +218,9 @@ frameMode=false
 Truncated=true
 speechTrain=[
     action=train
-    modelPath=C:\cygwin64\tmp\cntk-test-20151001133538.306652\Speech\LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn
+    modelPath=C:\cygwin64\tmp\cntk-test-20151024135143.969217\Speech\LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn
     deviceId=0
     traceLevel=1
-    NDLNetworkBuilder=[
-        networkDescription=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\LSTM/lstmp-3layer_WithSelfStab.ndl
-    ]    
     SGD=[
         epochSize=20480
         minibatchSize=20
@@ -246,11 +240,11 @@ speechTrain=[
       features=[
           dim=363
           type=Real
-          scpFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.scp
+          scpFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.scp
       ]
       labels=[
-          mlfFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf
-          labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list
+          mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf
+          labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list
           labelDim=132
           labelType=Category
       ]
@@ -400,10 +394,10 @@ feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features);
         ScaledLogLikelihood = Minus(LSTMoutputW, logPrior, tag='output')    // sadly we can't say x - y since we want to assign a tag
     ]
 ]
-RunDir=C:\cygwin64\tmp\cntk-test-20151001133538.306652\Speech\LSTM_FullUtterance@debug_gpu
-DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data
+RunDir=C:\cygwin64\tmp\cntk-test-20151024135143.969217\Speech\LSTM_FullUtterance@debug_gpu
+DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
+ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM
 DeviceId=0
-NDLDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\LSTM
 Truncated=false
 speechTrain=[reader=[nbruttsineachrecurrentiter=1]]
 speechTrain=[SGD=[epochSize=2560]]
@@ -414,21 +408,18 @@ speechTrain=[SGD=[numMBsToShowResult=1]]
 
 >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 configparameters: cntk.config:command=speechTrain
-configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data
+configparameters: cntk.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM
+configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
 configparameters: cntk.config:deviceId=0
 configparameters: cntk.config:frameMode=false
-configparameters: cntk.config:NDLDir=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\LSTM
 configparameters: cntk.config:parallelTrain=false
 configparameters: cntk.config:precision=float
-configparameters: cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20151001133538.306652\Speech\LSTM_FullUtterance@debug_gpu
+configparameters: cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024135143.969217\Speech\LSTM_FullUtterance@debug_gpu
 configparameters: cntk.config:speechTrain=[
     action=train
-    modelPath=C:\cygwin64\tmp\cntk-test-20151001133538.306652\Speech\LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn
+    modelPath=C:\cygwin64\tmp\cntk-test-20151024135143.969217\Speech\LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn
     deviceId=0
     traceLevel=1
-    NDLNetworkBuilder=[
-        networkDescription=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\LSTM/lstmp-3layer_WithSelfStab.ndl
-    ]    
     SGD=[
         epochSize=20480
         minibatchSize=20
@@ -448,11 +439,11 @@ configparameters: cntk.config:speechTrain=[
       features=[
           dim=363
           type=Real
-          scpFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.scp
+          scpFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.scp
       ]
       labels=[
-          mlfFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf
-          labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list
+          mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf
+          labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list
           labelDim=132
           labelType=Category
       ]
@@ -599,8 +590,7 @@ feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features);
         Err = ErrorPrediction(labels, LSTMoutputW, tag='eval')              // this also gets tracked
         // decoding
         logPrior = LogPrior(labels)	 
-        ScaledLogLikelihood = Minus(LSTMoutputW, logPCNTKModelPath: C:\cygwin64\tmp\cntk-test-20151001133538.306652\Speech\LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn
-rior, tag='output')    // sadly we can't say x - y since we want to assign a tag
+        ScaledLogLikelihood = Minus(LSTMoutputW, logPrior, tag='output')    // sadly we can't say x - y since we want to assign a tag
     ]
 ] [reader=[nbruttsineachrecurrentiter=1]] [SGD=[epochSize=2560]] [SGD=[maxEpochs=2]] [SGD=[numMBsToShowResult=1]]
 
@@ -608,2129 +598,3389 @@ configparameters: cntk.config:Truncated=false
 <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 command: speechTrain 
 precision = float
+CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024135143.969217\Speech\LSTM_FullUtterance@debug_gpu/models/cntkSpeech.dnn
+CNTKCommandTrainInfo: speechTrain : 2
+CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 2
 CNTKCommandTrainBegin: speechTrain
-NDLBuilder Using GPU 0
-reading script file E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.scp ... 948 entries
+ExperimentalNetworkBuilder using GPU 0
+reading script file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.scp ... 948 entries
 trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion
-total 132 state names in state list E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/state.list
-htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master\Tests\Speech\Data/glob_0000.mlf ... total 948 entries
+total 132 state names in state list E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list
+htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf ... total 948 entries
 ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
 label set 0: 129 classes
 minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
- nodes in the recurrent loops : 
-LSTMoutput1.unnamed174	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.bit	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.unnamed224	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.bit	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.unnamed274	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.bit	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.unnamed174	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.bit	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.unnamed224	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.bit	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.unnamed274	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.bit	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
-
-Printing Gradient Computation Node Order ... 
-
-cr[0, 0] = CrossEntropyWithSoftmax(labels[132, 1], LSTMoutputW[0, 0])
-LSTMoutputW[0, 0] = Plus(unnamed283[0, 0], b[132, 1])
-b[132, 1] = LearnableParameter
-unnamed283[0, 0] = Times(W[132, 256], unnamed284[0, 0])
-unnamed284[0, 0] = Scale(expsW[0, 0], LSTMoutput3.output[0, 0])
-LSTMoutput3.output[0, 0] = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[0, 0])
-LSTMoutput3.unnamed275[0, 0] = Scale(LSTMoutput3.expsWmr[0, 0], LSTMoutput3.mt[0, 0])
-LSTMoutput3.mt[0, 0] = ElementTimes(LSTMoutput3.ot[0, 0], LSTMoutput3.unnamed274[0, 0])
-LSTMoutput3.unnamed274[0, 0] = Tanh(LSTMoutput3.ct[0, 0])
-LSTMoutput3.ot[0, 0] = Sigmoid(LSTMoutput3.unnamed271[0, 0])
-LSTMoutput3.unnamed271[0, 0] = Plus(LSTMoutput3.unnamed272[0, 0], LSTMoutput3.Wcoct[0, 0])
-LSTMoutput3.Wcoct[0, 0] = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[0, 0])
-LSTMoutput3.unnamed270[0, 0] = Scale(LSTMoutput3.expsWco[0, 0], LSTMoutput3.ct[0, 0])
-LSTMoutput3.ct[0, 0] = Plus(LSTMoutput3.bft[0, 0], LSTMoutput3.bit[0, 0])
-LSTMoutput3.bit[0, 0] = ElementTimes(LSTMoutput3.it[0, 0], LSTMoutput3.unnamed259[0, 0])
-LSTMoutput3.unnamed259[0, 0] = Tanh(LSTMoutput3.unnamed260[0, 0])
-LSTMoutput3.unnamed260[0, 0] = Plus(LSTMoutput3.Wxcx[0, 0], LSTMoutput3.unnamed261[0, 0])
-LSTMoutput3.unnamed261[0, 0] = Plus(LSTMoutput3.Whcdh[0, 0], LSTMoutput3.bc[1024, 1])
-LSTMoutput3.Whcdh[0, 0] = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[0, 0])
-LSTMoutput3.unnamed258[0, 0] = Scale(LSTMoutput3.expsWhc[0, 0], LSTMoutput3.dh[256, 1])
-LSTMoutput3.it[0, 0] = Sigmoid(LSTMoutput3.unnamed254[0, 0])
-LSTMoutput3.unnamed254[0, 0] = Plus(LSTMoutput3.unnamed255[0, 0], LSTMoutput3.Wcidc[0, 0])
-LSTMoutput3.Wcidc[0, 0] = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[0, 0])
-LSTMoutput3.unnamed253[0, 0] = Scale(LSTMoutput3.expsWci[0, 0], LSTMoutput3.dc[1024, 1])
-LSTMoutput3.unnamed255[0, 0] = Plus(LSTMoutput3.unnamed256[0, 0], LSTMoutput3.Whidh[0, 0])
-LSTMoutput3.Whidh[0, 0] = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[0, 0])
-LSTMoutput3.unnamed252[0, 0] = Scale(LSTMoutput3.expsWhi[0, 0], LSTMoutput3.dh[256, 1])
-LSTMoutput3.bft[0, 0] = ElementTimes(LSTMoutput3.ft[0, 0], LSTMoutput3.dc[1024, 1])
-LSTMoutput3.ft[0, 0] = Sigmoid(LSTMoutput3.unnamed265[0, 0])
-LSTMoutput3.unnamed265[0, 0] = Plus(LSTMoutput3.unnamed266[0, 0], LSTMoutput3.Wcfdc[0, 0])
-LSTMoutput3.Wcfdc[0, 0] = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[0, 0])
-LSTMoutput3.unnamed264[0, 0] = Scale(LSTMoutput3.expsWcf[0, 0], LSTMoutput3.dc[1024, 1])
-LSTMoutput3.dc[1024, 1] = PastValue(LSTMoutput3.ct[0, 0])
-LSTMoutput3.unnamed266[0, 0] = Plus(LSTMoutput3.unnamed267[0, 0], LSTMoutput3.Whfdh[0, 0])
-LSTMoutput3.Whfdh[0, 0] = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[0, 0])
-LSTMoutput3.unnamed263[0, 0] = Scale(LSTMoutput3.expsWhf[0, 0], LSTMoutput3.dh[256, 1])
-LSTMoutput3.unnamed272[0, 0] = Plus(LSTMoutput3.unnamed273[0, 0], LSTMoutput3.Whodh[0, 0])
-LSTMoutput3.Whodh[0, 0] = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[0, 0])
-LSTMoutput3.unnamed269[0, 0] = Scale(LSTMoutput3.expsWho[0, 0], LSTMoutput3.dh[256, 1])
-LSTMoutput3.dh[256, 1] = PastValue(LSTMoutput3.output[0, 0])
-LSTMoutput3.bc[1024, 1] = LearnableParameter
-LSTMoutput3.expsWhc[0, 0] = Exp(LSTMoutput3.sWhc[1, 1])
-LSTMoutput3.sWhc[1, 1] = LearnableParameter
-LSTMoutput3.Whc[1024, 256] = LearnableParameter
-LSTMoutput3.Wxcx[0, 0] = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[0, 0])
-LSTMoutput3.unnamed257[0, 0] = Scale(LSTMoutput3.expsWxc[0, 0], LSTMoutput2.output[0, 0])
-LSTMoutput3.expsWxc[0, 0] = Exp(LSTMoutput3.sWxc[1, 1])
-LSTMoutput3.sWxc[1, 1] = LearnableParameter
-LSTMoutput3.Wxc[1024, 256] = LearnableParameter
-LSTMoutput3.expsWci[0, 0] = Exp(LSTMoutput3.sWci[1, 1])
-LSTMoutput3.sWci[1, 1] = LearnableParameter
-LSTMoutput3.Wci[1024, 1] = LearnableParameter
-LSTMoutput3.expsWhi[0, 0] = Exp(LSTMoutput3.sWhi[1, 1])
-LSTMoutput3.sWhi[1, 1] = LearnableParameter
-LSTMoutput3.Whi[1024, 256] = LearnableParameter
-LSTMoutput3.unnamed256[0, 0] = Plus(LSTMoutput3.Wxix[0, 0], LSTMoutput3.bi[1024, 1])
-LSTMoutput3.bi[1024, 1] = LearnableParameter
-LSTMoutput3.Wxix[0, 0] = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[0, 0])
-LSTMoutput3.unnamed251[0, 0] = Scale(LSTMoutput3.expsWxi[0, 0], LSTMoutput2.output[0, 0])
-LSTMoutput3.expsWxi[0, 0] = Exp(LSTMoutput3.sWxi[1, 1])
-LSTMoutput3.sWxi[1, 1] = LearnableParameter
-LSTMoutput3.Wxi[1024, 256] = LearnableParameter
-LSTMoutput3.expsWcf[0, 0] = Exp(LSTMoutput3.sWcf[1, 1])
-LSTMoutput3.sWcf[1, 1] = LearnableParameter
-LSTMoutput3.Wcf[1024, 1] = LearnableParameter
-LSTMoutput3.expsWhf[0, 0] = Exp(LSTMoutput3.sWhf[1, 1])
-LSTMoutput3.sWhf[1, 1] = LearnableParameter
-LSTMoutput3.Whf[1024, 256] = LearnableParameter
-LSTMoutput3.unnamed267[0, 0] = Plus(LSTMoutput3.Wxfx[0, 0], LSTMoutput3.bf[1024, 1])
-LSTMoutput3.bf[1024, 1] = LearnableParameter
-LSTMoutput3.Wxfx[0, 0] = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[0, 0])
-LSTMoutput3.unnamed262[0, 0] = Scale(LSTMoutput3.expsWxf[0, 0], LSTMoutput2.output[0, 0])
-LSTMoutput3.expsWxf[0, 0] = Exp(LSTMoutput3.sWxf[1, 1])
-LSTMoutput3.sWxf[1, 1] = LearnableParameter
-LSTMoutput3.Wxf[1024, 256] = LearnableParameter
-LSTMoutput3.expsWco[0, 0] = Exp(LSTMoutput3.sWco[1, 1])
-LSTMoutput3.sWco[1, 1] = LearnableParameter
-LSTMoutput3.Wco[1024, 1] = LearnableParameter
-LSTMoutput3.expsWho[0, 0] = Exp(LSTMoutput3.sWho[1, 1])
-LSTMoutput3.sWho[1, 1] = LearnableParameter
-LSTMoutput3.Who[1024, 256] = LearnableParameter
-LSTMoutput3.unnamed273[0, 0] = Plus(LSTMoutput3.Wxox[0, 0], LSTMoutput3.bo[1024, 1])
-LSTMoutput3.bo[1024, 1] = LearnableParameter
-LSTMoutput3.Wxox[0, 0] = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[0, 0])
-LSTMoutput3.unnamed268[0, 0] = Scale(LSTMoutput3.expsWxo[0, 0], LSTMoutput2.output[0, 0])
-LSTMoutput2.output[0, 0] = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[0, 0])
-LSTMoutput2.unnamed225[0, 0] = Scale(LSTMoutput2.expsWmr[0, 0], LSTMoutput2.mt[0, 0])
-LSTMoutput2.mt[0, 0] = ElementTimes(LSTMoutput2.ot[0, 0], LSTMoutput2.unnamed224[0, 0])
-LSTMoutput2.unnamed224[0, 0] = Tanh(LSTMoutput2.ct[0, 0])
-LSTMoutput2.ot[0, 0] = Sigmoid(LSTMoutput2.unnamed221[0, 0])
-LSTMoutput2.unnamed221[0, 0] = Plus(LSTMoutput2.unnamed222[0, 0], LSTMoutput2.Wcoct[0, 0])
-LSTMoutput2.Wcoct[0, 0] = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[0, 0])
-LSTMoutput2.unnamed220[0, 0] = Scale(LSTMoutput2.expsWco[0, 0], LSTMoutput2.ct[0, 0])
-LSTMoutput2.ct[0, 0] = Plus(LSTMoutput2.bft[0, 0], LSTMoutput2.bit[0, 0])
-LSTMoutput2.bit[0, 0] = ElementTimes(LSTMoutput2.it[0, 0], LSTMoutput2.unnamed209[0, 0])
-LSTMoutput2.unnamed209[0, 0] = Tanh(LSTMoutput2.unnamed210[0, 0])
-LSTMoutput2.unnamed210[0, 0] = Plus(LSTMoutput2.Wxcx[0, 0], LSTMoutput2.unnamed211[0, 0])
-LSTMoutput2.unnamed211[0, 0] = Plus(LSTMoutput2.Whcdh[0, 0], LSTMoutput2.bc[1024, 1])
-LSTMoutput2.Whcdh[0, 0] = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[0, 0])
-LSTMoutput2.unnamed208[0, 0] = Scale(LSTMoutput2.expsWhc[0, 0], LSTMoutput2.dh[256, 1])
-LSTMoutput2.it[0, 0] = Sigmoid(LSTMoutput2.unnamed204[0, 0])
-LSTMoutput2.unnamed204[0, 0] = Plus(LSTMoutput2.unnamed205[0, 0], LSTMoutput2.Wcidc[0, 0])
-LSTMoutput2.Wcidc[0, 0] = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[0, 0])
-LSTMoutput2.unnamed203[0, 0] = Scale(LSTMoutput2.expsWci[0, 0], LSTMoutput2.dc[1024, 1])
-LSTMoutput2.unnamed205[0, 0] = Plus(LSTMoutput2.unnamed206[0, 0], LSTMoutput2.Whidh[0, 0])
-LSTMoutput2.Whidh[0, 0] = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[0, 0])
-LSTMoutput2.unnamed202[0, 0] = Scale(LSTMoutput2.expsWhi[0, 0], LSTMoutput2.dh[256, 1])
-LSTMoutput2.bft[0, 0] = ElementTimes(LSTMoutput2.ft[0, 0], LSTMoutput2.dc[1024, 1])
-LSTMoutput2.ft[0, 0] = Sigmoid(LSTMoutput2.unnamed215[0, 0])
-LSTMoutput2.unnamed215[0, 0] = Plus(LSTMoutput2.unnamed216[0, 0], LSTMoutput2.Wcfdc[0, 0])
-LSTMoutput2.Wcfdc[0, 0] = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[0, 0])
-LSTMoutput2.unnamed214[0, 0] = Scale(LSTMoutput2.expsWcf[0, 0], LSTMoutput2.dc[1024, 1])
-LSTMoutput2.dc[1024, 1] = PastValue(LSTMoutput2.ct[0, 0])
-LSTMoutput2.unnamed216[0, 0] = Plus(LSTMoutput2.unnamed217[0, 0], LSTMoutput2.Whfdh[0, 0])
-LSTMoutput2.Whfdh[0, 0] = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[0, 0])
-LSTMoutput2.unnamed213[0, 0] = Scale(LSTMoutput2.expsWhf[0, 0], LSTMoutput2.dh[256, 1])
-LSTMoutput2.unnamed222[0, 0] = Plus(LSTMoutput2.unnamed223[0, 0], LSTMoutput2.Whodh[0, 0])
-LSTMoutput2.Whodh[0, 0] = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[0, 0])
-LSTMoutput2.unnamed219[0, 0] = Scale(LSTMoutput2.expsWho[0, 0], LSTMoutput2.dh[256, 1])
-LSTMoutput2.dh[256, 1] = PastValue(LSTMoutput2.output[0, 0])
-LSTMoutput2.bc[1024, 1] = LearnableParameter
-LSTMoutput2.expsWhc[0, 0] = Exp(LSTMoutput2.sWhc[1, 1])
-LSTMoutput2.sWhc[1, 1] = LearnableParameter
-LSTMoutput2.Whc[1024, 256] = LearnableParameter
-LSTMoutput2.Wxcx[0, 0] = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[0, 0])
-LSTMoutput2.unnamed207[0, 0] = Scale(LSTMoutput2.expsWxc[0, 0], LSTMoutput1.output[0, 0])
-LSTMoutput2.expsWxc[0, 0] = Exp(LSTMoutput2.sWxc[1, 1])
-LSTMoutput2.sWxc[1, 1] = LearnableParameter
-LSTMoutput2.Wxc[1024, 256] = LearnableParameter
-LSTMoutput2.expsWci[0, 0] = Exp(LSTMoutput2.sWci[1, 1])
-LSTMoutput2.sWci[1, 1] = LearnableParameter
-LSTMoutput2.Wci[1024, 1] = LearnableParameter
-LSTMoutput2.expsWhi[0, 0] = Exp(LSTMoutput2.sWhi[1, 1])
-LSTMoutput2.sWhi[1, 1] = LearnableParameter
-LSTMoutput2.Whi[1024, 256] = LearnableParameter
-LSTMoutput2.unnamed206[0, 0] = Plus(LSTMoutput2.Wxix[0, 0], LSTMoutput2.bi[1024, 1])
-LSTMoutput2.bi[1024, 1] = LearnableParameter
-LSTMoutput2.Wxix[0, 0] = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[0, 0])
-LSTMoutput2.unnamed201[0, 0] = Scale(LSTMoutput2.expsWxi[0, 0], LSTMoutput1.output[0, 0])
-LSTMoutput2.expsWxi[0, 0] = Exp(LSTMoutput2.sWxi[1, 1])
-LSTMoutput2.sWxi[1, 1] = LearnableParameter
-LSTMoutput2.Wxi[1024, 256] = LearnableParameter
-LSTMoutput2.expsWcf[0, 0] = Exp(LSTMoutput2.sWcf[1, 1])
-LSTMoutput2.sWcf[1, 1] = LearnableParameter
-LSTMoutput2.Wcf[1024, 1] = LearnableParameter
-LSTMoutput2.expsWhf[0, 0] = Exp(LSTMoutput2.sWhf[1, 1])
-LSTMoutput2.sWhf[1, 1] = LearnableParameter
-LSTMoutput2.Whf[1024, 256] = LearnableParameter
-LSTMoutput2.unnamed217[0, 0] = Plus(LSTMoutput2.Wxfx[0, 0], LSTMoutput2.bf[1024, 1])
-LSTMoutput2.bf[1024, 1] = LearnableParameter
-LSTMoutput2.Wxfx[0, 0] = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[0, 0])
-LSTMoutput2.unnamed212[0, 0] = Scale(LSTMoutput2.expsWxf[0, 0], LSTMoutput1.output[0, 0])
-LSTMoutput2.expsWxf[0, 0] = Exp(LSTMoutput2.sWxf[1, 1])
-LSTMoutput2.sWxf[1, 1] = LearnableParameter
-LSTMoutput2.Wxf[1024, 256] = LearnableParameter
-LSTMoutput2.expsWco[0, 0] = Exp(LSTMoutput2.sWco[1, 1])
-LSTMoutput2.sWco[1, 1] = LearnableParameter
-LSTMoutput2.Wco[1024, 1] = LearnableParameter
-LSTMoutput2.expsWho[0, 0] = Exp(LSTMoutput2.sWho[1, 1])
-LSTMoutput2.sWho[1, 1] = LearnableParameter
-LSTMoutput2.Who[1024, 256] = LearnableParameter
-LSTMoutput2.unnamed223[0, 0] = Plus(LSTMoutput2.Wxox[0, 0], LSTMoutput2.bo[1024, 1])
-LSTMoutput2.bo[1024, 1] = LearnableParameter
-LSTMoutput2.Wxox[0, 0] = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[0, 0])
-LSTMoutput2.unnamed218[0, 0] = Scale(LSTMoutput2.expsWxo[0, 0], LSTMoutput1.output[0, 0])
-LSTMoutput1.output[0, 0] = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[0, 0])
-LSTMoutput1.unnamed175[0, 0] = Scale(LSTMoutput1.expsWmr[0, 0], LSTMoutput1.mt[0, 0])
-LSTMoutput1.mt[0, 0] = ElementTimes(LSTMoutput1.ot[0, 0], LSTMoutput1.unnamed174[0, 0])
-LSTMoutput1.unnamed174[0, 0] = Tanh(LSTMoutput1.ct[0, 0])
-LSTMoutput1.ot[0, 0] = Sigmoid(LSTMoutput1.unnamed171[0, 0])
-LSTMoutput1.unnamed171[0, 0] = Plus(LSTMoutput1.unnamed172[0, 0], LSTMoutput1.Wcoct[0, 0])
-LSTMoutput1.Wcoct[0, 0] = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[0, 0])
-LSTMoutput1.unnamed170[0, 0] = Scale(LSTMoutput1.expsWco[0, 0], LSTMoutput1.ct[0, 0])
-LSTMoutput1.ct[0, 0] = Plus(LSTMoutput1.bft[0, 0], LSTMoutput1.bit[0, 0])
-LSTMoutput1.bit[0, 0] = ElementTimes(LSTMoutput1.it[0, 0], LSTMoutput1.unnamed159[0, 0])
-LSTMoutput1.unnamed159[0, 0] = Tanh(LSTMoutput1.unnamed160[0, 0])
-LSTMoutput1.unnamed160[0, 0] = Plus(LSTMoutput1.Wxcx[0, 0], LSTMoutput1.unnamed161[0, 0])
-LSTMoutput1.unnamed161[0, 0] = Plus(LSTMoutput1.Whcdh[0, 0], LSTMoutput1.bc[1024, 1])
-LSTMoutput1.Whcdh[0, 0] = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[0, 0])
-LSTMoutput1.unnamed158[0, 0] = Scale(LSTMoutput1.expsWhc[0, 0], LSTMoutput1.dh[256, 1])
-LSTMoutput1.it[0, 0] = Sigmoid(LSTMoutput1.unnamed154[0, 0])
-LSTMoutput1.unnamed154[0, 0] = Plus(LSTMoutput1.unnamed155[0, 0], LSTMoutput1.Wcidc[0, 0])
-LSTMoutput1.Wcidc[0, 0] = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[0, 0])
-LSTMoutput1.unnamed153[0, 0] = Scale(LSTMoutput1.expsWci[0, 0], LSTMoutput1.dc[1024, 1])
-LSTMoutput1.unnamed155[0, 0] = Plus(LSTMoutput1.unnamed156[0, 0], LSTMoutput1.Whidh[0, 0])
-LSTMoutput1.Whidh[0, 0] = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[0, 0])
-LSTMoutput1.unnamed152[0, 0] = Scale(LSTMoutput1.expsWhi[0, 0], LSTMoutput1.dh[256, 1])
-LSTMoutput1.bft[0, 0] = ElementTimes(LSTMoutput1.ft[0, 0], LSTMoutput1.dc[1024, 1])
-LSTMoutput1.ft[0, 0] = Sigmoid(LSTMoutput1.unnamed165[0, 0])
-LSTMoutput1.unnamed165[0, 0] = Plus(LSTMoutput1.unnamed166[0, 0], LSTMoutput1.Wcfdc[0, 0])
-LSTMoutput1.Wcfdc[0, 0] = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[0, 0])
-LSTMoutput1.unnamed164[0, 0] = Scale(LSTMoutput1.expsWcf[0, 0], LSTMoutput1.dc[1024, 1])
-LSTMoutput1.dc[1024, 1] = PastValue(LSTMoutput1.ct[0, 0])
-LSTMoutput1.unnamed166[0, 0] = Plus(LSTMoutput1.unnamed167[0, 0], LSTMoutput1.Whfdh[0, 0])
-LSTMoutput1.Whfdh[0, 0] = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[0, 0])
-LSTMoutput1.unnamed163[0, 0] = Scale(LSTMoutput1.expsWhf[0, 0], LSTMoutput1.dh[256, 1])
-LSTMoutput1.unnamed172[0, 0] = Plus(LSTMoutput1.unnamed173[0, 0], LSTMoutput1.Whodh[0, 0])
-LSTMoutput1.Whodh[0, 0] = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[0, 0])
-LSTMoutput1.unnamed169[0, 0] = Scale(LSTMoutput1.expsWho[0, 0], LSTMoutput1.dh[256, 1])
-LSTMoutput1.dh[256, 1] = PastValue(LSTMoutput1.output[0, 0])
-LSTMoutput1.bc[1024, 1] = LearnableParameter
-LSTMoutput1.expsWhc[0, 0] = Exp(LSTMoutput1.sWhc[1, 1])
-LSTMoutput1.sWhc[1, 1] = LearnableParameter
-LSTMoutput1.Whc[1024, 256] = LearnableParameter
-LSTMoutput1.Wxcx[0, 0] = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[0, 0])
-LSTMoutput1.unnamed157[0, 0] = Scale(LSTMoutput1.expsWxc[0, 0], featNorm.xNorm[0, 0])
-LSTMoutput1.expsWxc[0, 0] = Exp(LSTMoutput1.sWxc[1, 1])
-LSTMoutput1.sWxc[1, 1] = LearnableParameter
-LSTMoutput1.Wxc[1024, 33] = LearnableParameter
-LSTMoutput1.expsWci[0, 0] = Exp(LSTMoutput1.sWci[1, 1])
-LSTMoutput1.sWci[1, 1] = LearnableParameter
-LSTMoutput1.Wci[1024, 1] = LearnableParameter
-LSTMoutput1.expsWhi[0, 0] = Exp(LSTMoutput1.sWhi[1, 1])
-LSTMoutput1.sWhi[1, 1] = LearnableParameter
-LSTMoutput1.Whi[1024, 256] = LearnableParameter
-LSTMoutput1.unnamed156[0, 0] = Plus(LSTMoutput1.Wxix[0, 0], LSTMoutput1.bi[1024, 1])
-LSTMoutput1.bi[1024, 1] = LearnableParameter
-LSTMoutput1.Wxix[0, 0] = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[0, 0])
-LSTMoutput1.unnamed151[0, 0] = Scale(LSTMoutput1.expsWxi[0, 0], featNorm.xNorm[0, 0])
-LSTMoutput1.expsWxi[0, 0] = Exp(LSTMoutput1.sWxi[1, 1])
-LSTMoutput1.sWxi[1, 1] = LearnableParameter
-LSTMoutput1.Wxi[1024, 33] = LearnableParameter
-LSTMoutput1.expsWcf[0, 0] = Exp(LSTMoutput1.sWcf[1, 1])
-LSTMoutput1.sWcf[1, 1] = LearnableParameter
-LSTMoutput1.Wcf[1024, 1] = LearnableParameter
-LSTMoutput1.expsWhf[0, 0] = Exp(LSTMoutput1.sWhf[1, 1])
-LSTMoutput1.sWhf[1, 1] = LearnableParameter
-LSTMoutput1.Whf[1024, 256] = LearnableParameter
-LSTMoutput1.unnamed167[0, 0] = Plus(LSTMoutput1.Wxfx[0, 0], LSTMoutput1.bf[1024, 1])
-LSTMoutput1.bf[1024, 1] = LearnableParameter
-LSTMoutput1.Wxfx[0, 0] = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[0, 0])
-LSTMoutput1.unnamed162[0, 0] = Scale(LSTMoutput1.expsWxf[0, 0], featNorm.xNorm[0, 0])
-LSTMoutput1.expsWxf[0, 0] = Exp(LSTMoutput1.sWxf[1, 1])
-LSTMoutput1.sWxf[1, 1] = LearnableParameter
-LSTMoutput1.Wxf[1024, 33] = LearnableParameter
-LSTMoutput1.expsWco[0, 0] = Exp(LSTMoutput1.sWco[1, 1])
-LSTMoutput1.sWco[1, 1] = LearnableParameter
-LSTMoutput1.Wco[1024, 1] = LearnableParameter
-LSTMoutput1.expsWho[0, 0] = Exp(LSTMoutput1.sWho[1, 1])
-LSTMoutput1.sWho[1, 1] = LearnableParameter
-LSTMoutput1.Who[1024, 256] = LearnableParameter
-LSTMoutput1.unnamed173[0, 0] = Plus(LSTMoutput1.Wxox[0, 0], LSTMoutput1.bo[1024, 1])
-LSTMoutput1.bo[1024, 1] = LearnableParameter
-LSTMoutput1.Wxox[0, 0] = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[0, 0])
-LSTMoutput1.unnamed168[0, 0] = Scale(LSTMoutput1.expsWxo[0, 0], featNorm.xNorm[0, 0])
-featNorm.xNorm[0, 0] = PerDimMeanVarNormalization(feashift[0, 0], featNorm.xMean[0, 0], featNorm.xStdDev[0, 0])
-featNorm.xStdDev[0, 0] = InvStdDev(feashift[0, 0])
-featNorm.xMean[0, 0] = Mean(feashift[0, 0])
-feashift[0, 0] = RowSlice(features[363, 1])
-features[363, 1] = InputValue
-LSTMoutput1.expsWxo[0, 0] = Exp(LSTMoutput1.sWxo[1, 1])
-LSTMoutput1.sWxo[1, 1] = LearnableParameter
-LSTMoutput1.Wxo[1024, 33] = LearnableParameter
-LSTMoutput1.expsWmr[0, 0] = Exp(LSTMoutput1.sWmr[1, 1])
-LSTMoutput1.sWmr[1, 1] = LearnableParameter
-LSTMoutput1.Wmr[256, 1024] = LearnableParameter
-LSTMoutput2.expsWxo[0, 0] = Exp(LSTMoutput2.sWxo[1, 1])
-LSTMoutput2.sWxo[1, 1] = LearnableParameter
-LSTMoutput2.Wxo[1024, 256] = LearnableParameter
-LSTMoutput2.expsWmr[0, 0] = Exp(LSTMoutput2.sWmr[1, 1])
-LSTMoutput2.sWmr[1, 1] = LearnableParameter
-LSTMoutput2.Wmr[256, 1024] = LearnableParameter
-LSTMoutput3.expsWxo[0, 0] = Exp(LSTMoutput3.sWxo[1, 1])
-LSTMoutput3.sWxo[1, 1] = LearnableParameter
-LSTMoutput3.Wxo[1024, 256] = LearnableParameter
-LSTMoutput3.expsWmr[0, 0] = Exp(LSTMoutput3.sWmr[1, 1])
-LSTMoutput3.sWmr[1, 1] = LearnableParameter
-LSTMoutput3.Wmr[256, 1024] = LearnableParameter
-expsW[0, 0] = Exp(sW[1, 1])
-sW[1, 1] = LearnableParameter
-W[132, 256] = LearnableParameter
-labels[132, 1] = InputValue
-
-Validating node cr 
- nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
-
-Validating node cr 
-
-Validating --> labels = InputValue -> [132, MBSize 1]
-Validating --> W = LearnableParameter -> [132, 256]
-Validating --> sW = LearnableParameter -> [1, 1]
-Validating --> expsW = Exp(sW[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput3.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1]
-Validating --> features = InputValue -> [363, MBSize 1]
-Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
-Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, MBSize 1], LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [256, 1]
-Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [256, 1]
-Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [256, 1]
-Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [256, 1]
-Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 1], LSTMoutput1.bc[1024, 1]) -> [1024, 1]
-Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 1], LSTMoutput1.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed174[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [256, 1]
-Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [256, 1]
-Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [256, 1]
-Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [256, 1]
-Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, 1], LSTMoutput2.bc[1024, 1]) -> [1024, 1]
-Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.unnamed224[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [256, 1]
-Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [256, 1]
-Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [256, 1]
-Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], LSTMoutput3.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [256, 1]
-Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256 {W=3452816845, H=3452816845, C=3452816845}, 1]) -> [1024, 1]
-Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 1], LSTMoutput3.bc[1024, 1]) -> [1024, 1]
-Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], LSTMoutput3.unnamed261[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, MBSize 1]) -> [132, MBSize 1]
-Validating --> b = LearnableParameter -> [132, 1]
-Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1]
-Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
----
-Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-
-Revalidating
-
-
-
-
-
-127 out of 272 nodes do not share the minibatch layout with the input data.
-
-Validating --> labels = InputValue -> [132, MBSize 1]
-Validating --> W = LearnableParameter -> [132, 256]
-Validating --> sW = LearnableParameter -> [1, 1]
-Validating --> expsW = Exp(sW[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput3.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1]
-Validating --> features = InputValue -> [363, MBSize 1]
-Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
-Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, MBSize 1], LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, MBSize 1], LSTMoutput1.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 1], LSTMoutput1.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed174[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, MBSize 1], LSTMoutput2.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.unnamed224[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], LSTMoutput3.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, MBSize 1], LSTMoutput3.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], LSTMoutput3.unnamed261[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, MBSize 1]) -> [132, MBSize 1]
-Validating --> b = LearnableParameter -> [132, 1]
-Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1]
-Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
----
-Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-
-Revalidating
-
-
-
-
-
-127 out of 272 nodes do not share the minibatch layout with the input data.
- nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
-
-Validating node ScaledLogLikelihood 
- nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
-
-Validating node ScaledLogLikelihood 
-
-Validating --> W = LearnableParameter -> [132, 256]
-Validating --> sW = LearnableParameter -> [1, 1]
-Validating --> expsW = Exp(sW[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput3.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1]
-Validating --> features = InputValue -> [363, MBSize 1]
-Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
-Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, MBSize 1], LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, MBSize 1], LSTMoutput1.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 1], LSTMoutput1.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed174[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, MBSize 1], LSTMoutput2.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.unnamed224[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], LSTMoutput3.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, MBSize 1], LSTMoutput3.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], LSTMoutput3.unnamed261[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, MBSize 1]) -> [132, MBSize 1]
-Validating --> b = LearnableParameter -> [132, 1]
-Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1]
-Validating --> labels = InputValue -> [132, MBSize 1]
-Validating --> logPrior.Prior = Mean(labels[132, MBSize 1]) -> [132, 1]
-Validating --> logPrior.LogPrior = Log(logPrior.Prior[132, 1]) -> [132, 1]
-Validating --> ScaledLogLikelihood = Minus(LSTMoutputW[132, MBSize 1], logPrior.LogPrior[132, 1]) -> [132, MBSize 1]
----
-Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-
-Revalidating
-
-
-
-
-
-128 out of 274 nodes do not share the minibatch layout with the input data.
-
-Validating --> W = LearnableParameter -> [132, 256]
-Validating --> sW = LearnableParameter -> [1, 1]
-Validating --> expsW = Exp(sW[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput3.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1]
-Validating --> features = InputValue -> [363, MBSize 1]
-Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
-Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, MBSize 1], LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, MBSize 1], LSTMoutput1.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 1], LSTMoutput1.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed174[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, MBSize 1], LSTMoutput2.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.unnamed224[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], LSTMoutput3.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, MBSize 1], LSTMoutput3.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], LSTMoutput3.unnamed261[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, MBSize 1]) -> [132, MBSize 1]
-Validating --> b = LearnableParameter -> [132, 1]
-Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1]
-Validating --> labels = InputValue -> [132, MBSize 1]
-Validating --> logPrior.Prior = Mean(labels[132, MBSize 1]) -> [132, 1]
-Validating --> logPrior.LogPrior = Log(logPrior.Prior[132, 1]) -> [132, 1]
-Validating --> ScaledLogLikelihood = Minus(LSTMoutputW[132, MBSize 1], logPrior.LogPrior[132, 1]) -> [132, MBSize 1]
----
-Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-
-Revalidating
-
-
-
-
-
-128 out of 274 nodes do not share the minibatch layout with the input data.
- nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
-
-Validating node Err 
- nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
-
-Validating node Err 
-
-Validating --> labels = InputValue -> [132, MBSize 1]
-Validating --> W = LearnableParameter -> [132, 256]
-Validating --> sW = LearnableParameter -> [1, 1]
-Validating --> expsW = Exp(sW[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput3.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1]
-Validating --> features = InputValue -> [363, MBSize 1]
-Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
-Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, MBSize 1], LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, MBSize 1], LSTMoutput1.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 1], LSTMoutput1.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed174[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, MBSize 1], LSTMoutput2.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.unnamed224[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], LSTMoutput3.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, MBSize 1], LSTMoutput3.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], LSTMoutput3.unnamed261[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, MBSize 1]) -> [132, MBSize 1]
-Validating --> b = LearnableParameter -> [132, 1]
-Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
----
-Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-
-Revalidating
-
-
-
-
-
-127 out of 272 nodes do not share the minibatch layout with the input data.
-
-Validating --> labels = InputValue -> [132, MBSize 1]
-Validating --> W = LearnableParameter -> [132, 256]
-Validating --> sW = LearnableParameter -> [1, 1]
-Validating --> expsW = Exp(sW[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput3.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput2.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxo = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wmr = LearnableParameter -> [256, 1024]
-Validating --> LSTMoutput1.sWmr = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxo = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxo = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1]) -> [1, 1]
-Validating --> features = InputValue -> [363, MBSize 1]
-Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
-Validating --> featNorm.xMean = Mean(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
-Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, MBSize 1], LSTMoutput1.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxf = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, MBSize 1], LSTMoutput1.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxi = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, MBSize 1], LSTMoutput1.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.Wxc = LearnableParameter -> [1024, 33]
-Validating --> LSTMoutput1.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, MBSize 1]) -> [33, MBSize 1]
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput1.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput1.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, MBSize 1], LSTMoutput1.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, MBSize 1], LSTMoutput1.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, MBSize 1], LSTMoutput1.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, MBSize 1], LSTMoutput1.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, MBSize 1], LSTMoutput1.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, MBSize 1], LSTMoutput1.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, MBSize 1], LSTMoutput1.unnamed161[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed159[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, MBSize 1], LSTMoutput1.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput1.unnamed174[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, MBSize 1], LSTMoutput2.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, MBSize 1], LSTMoutput2.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, MBSize 1], LSTMoutput2.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput2.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput2.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, MBSize 1], LSTMoutput2.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, MBSize 1], LSTMoutput2.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, MBSize 1], LSTMoutput2.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, MBSize 1], LSTMoutput2.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, MBSize 1], LSTMoutput2.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, MBSize 1], LSTMoutput2.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, MBSize 1], LSTMoutput2.unnamed211[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.unnamed209[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, MBSize 1], LSTMoutput2.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput2.unnamed224[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bo = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, MBSize 1], LSTMoutput3.bo[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Who = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWho = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wco = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWco = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, MBSize 1], LSTMoutput3.bf[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whf = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wcf = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWcf = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bi = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, MBSize 1], LSTMoutput3.bi[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whi = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhi = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wci = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.sWci = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.Wxc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWxc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Whc = LearnableParameter -> [1024, 256]
-Validating --> LSTMoutput3.sWhc = LearnableParameter -> [1, 1]
-Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1]) -> [1, 1]
-Validating --> LSTMoutput3.bc = LearnableParameter -> [1024, 1]
-Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, MBSize 1], LSTMoutput3.Whodh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, MBSize 1], LSTMoutput3.Whfdh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, MBSize 1], LSTMoutput3.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, MBSize 1], LSTMoutput3.Whidh[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, MBSize 1], LSTMoutput3.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, MBSize 1], LSTMoutput3.bc[1024, 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, MBSize 1], LSTMoutput3.unnamed261[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed259[1024, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, MBSize 1], LSTMoutput3.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1], LSTMoutput3.unnamed274[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, MBSize 1]) -> [132, MBSize 1]
-Validating --> b = LearnableParameter -> [132, 1]
-Validating --> LSTMoutputW = Plus(unnamed283[132, MBSize 1], b[132, 1]) -> [132, MBSize 1]
-Validating --> Err = ErrorPrediction(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
----
-Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, MBSize 1]) -> [256, MBSize 1]
-Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, MBSize 1]) -> [1024, MBSize 1]
-
-Revalidating
-
-
-
-
-
-127 out of 272 nodes do not share the minibatch layout with the input data.
+Node --> B = LearnableParameter
+Node --> labels = InputValue
+Node --> LSTMoutputW./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].Wmr = LearnableParameter
+Node --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].Wmr = LearnableParameter
+Node --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].Wmr = LearnableParameter
+Node --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> features = InputValue
+Node --> feashift = RowSlice
+Node --> featNorm.meanVector = Mean
+Node --> featNorm.invStdDevVector = InvStdDev
+Node --> featNorm = PerDimMeanVarNormalization
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].dh = PastValue
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[1].ot.z./*+*/left = Plus
+Node --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[1].ft.z./*+*/left = Plus
+Node --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].dc = PastValue
+Node --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[1].ft.z./*+*/right = DiagTimes
+Node --> LSTMoutput[1].ft.z = Plus
+Node --> LSTMoutput[1].ft = Sigmoid
+Node --> LSTMoutput[1].bft = ElementTimes
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[1].it.z./*+*/left = Plus
+Node --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].it.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[1].it.z./*+*/right = DiagTimes
+Node --> LSTMoutput[1].it.z = Plus
+Node --> LSTMoutput[1].it = Sigmoid
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus
+Node --> LSTMoutput[1].bit./*.**/right.z = Plus
+Node --> LSTMoutput[1].bit./*.**/right = Tanh
+Node --> LSTMoutput[1].bit = ElementTimes
+Node --> LSTMoutput[1].ct = Plus
+Node --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[1].ot.z./*+*/right = DiagTimes
+Node --> LSTMoutput[1].ot.z = Plus
+Node --> LSTMoutput[1].ot = Sigmoid
+Node --> LSTMoutput[1].mt./*.**/right = Tanh
+Node --> LSTMoutput[1].mt = ElementTimes
+Node --> LSTMoutput[1].output./***/right = Scale
+Node --> LSTMoutput[1].output = Times
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].dh = PastValue
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[2].ot.z./*+*/left = Plus
+Node --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[2].ft.z./*+*/left = Plus
+Node --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].dc = PastValue
+Node --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[2].ft.z./*+*/right = DiagTimes
+Node --> LSTMoutput[2].ft.z = Plus
+Node --> LSTMoutput[2].ft = Sigmoid
+Node --> LSTMoutput[2].bft = ElementTimes
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[2].it.z./*+*/left = Plus
+Node --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].it.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[2].it.z./*+*/right = DiagTimes
+Node --> LSTMoutput[2].it.z = Plus
+Node --> LSTMoutput[2].it = Sigmoid
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus
+Node --> LSTMoutput[2].bit./*.**/right.z = Plus
+Node --> LSTMoutput[2].bit./*.**/right = Tanh
+Node --> LSTMoutput[2].bit = ElementTimes
+Node --> LSTMoutput[2].ct = Plus
+Node --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[2].ot.z./*+*/right = DiagTimes
+Node --> LSTMoutput[2].ot.z = Plus
+Node --> LSTMoutput[2].ot = Sigmoid
+Node --> LSTMoutput[2].mt./*.**/right = Tanh
+Node --> LSTMoutput[2].mt = ElementTimes
+Node --> LSTMoutput[2].output./***/right = Scale
+Node --> LSTMoutput[2].output = Times
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].dh = PastValue
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[3].ot.z./*+*/left = Plus
+Node --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[3].ft.z./*+*/left = Plus
+Node --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].dc = PastValue
+Node --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[3].ft.z./*+*/right = DiagTimes
+Node --> LSTMoutput[3].ft.z = Plus
+Node --> LSTMoutput[3].ft = Sigmoid
+Node --> LSTMoutput[3].bft = ElementTimes
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[3].it.z./*+*/left = Plus
+Node --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].it.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[3].it.z./*+*/right = DiagTimes
+Node --> LSTMoutput[3].it.z = Plus
+Node --> LSTMoutput[3].it = Sigmoid
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus
+Node --> LSTMoutput[3].bit./*.**/right.z = Plus
+Node --> LSTMoutput[3].bit./*.**/right = Tanh
+Node --> LSTMoutput[3].bit = ElementTimes
+Node --> LSTMoutput[3].ct = Plus
+Node --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[3].ot.z./*+*/right = DiagTimes
+Node --> LSTMoutput[3].ot.z = Plus
+Node --> LSTMoutput[3].ot = Sigmoid
+Node --> LSTMoutput[3].mt./*.**/right = Tanh
+Node --> LSTMoutput[3].mt = ElementTimes
+Node --> LSTMoutput[3].output./***/right = Scale
+Node --> LSTMoutput[3].output = Times
+Node --> LSTMoutputW./*+*/left./***/right = Scale
+Node --> LSTMoutputW./*+*/left = Times
+Node --> LSTMoutputW = Plus
+Node --> Err = ErrorPrediction
+Node --> logPrior.x = Mean
+Node --> logPrior = Log
+Node --> ScaledLogLikelihood = Minus
+Node --> cr = CrossEntropyWithSoftmax
+class Microsoft::MSR::CNTK::ComputationNetwork [
+  B : LearnableParameter 132 x 1 ()
+  cr : CrossEntropyWithSoftmax 0 x 0 (
+    labels
+    LSTMoutputW
+  )
+  Err : ErrorPrediction 0 x 0 (
+    labels
+    LSTMoutputW
+  )
+  feashift : RowSlice 0 x 0 (
+    features
+  )
+  featNorm : PerDimMeanVarNormalization 0 x 0 (
+    feashift
+    featNorm.meanVector
+    featNorm.invStdDevVector
+  )
+  featNorm.invStdDevVector : InvStdDev 0 x 0 (
+    feashift
+  )
+  featNorm.meanVector : Mean 0 x 0 (
+    feashift
+  )
+  features : InputValue 363 x 1 ()
+  labels : InputValue 132 x 1 ()
+  logPrior : Log 0 x 0 (
+    logPrior.x
+  )
+  logPrior.x : Mean 0 x 0 (
+    labels
+  )
+  LSTMoutput[1].bft : ElementTimes 0 x 0 (
+    LSTMoutput[1].ft
+    LSTMoutput[1].dc
+  )
+  LSTMoutput[1].bit : ElementTimes 0 x 0 (
+    LSTMoutput[1].it
+    LSTMoutput[1].bit./*.**/right
+  )
+  LSTMoutput[1].bit./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z
+  )
+  LSTMoutput[1].bit./*.**/right.z : Plus 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/left
+    LSTMoutput[1].bit./*.**/right.z./*+*/right
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/left : Times 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left
+    LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 33 ()
+  LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor
+    featNorm
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].bit./*.**/right.z./*+*/right : Plus 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[1].dh
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].ct : Plus 0 x 0 (
+    LSTMoutput[1].bft
+    LSTMoutput[1].bit
+  )
+  LSTMoutput[1].dc : PastValue 1024 x 1 (
+    LSTMoutput[1].ct
+  )
+  LSTMoutput[1].dh : PastValue 256 x 1 (
+    LSTMoutput[1].output
+  )
+  LSTMoutput[1].ft : Sigmoid 0 x 0 (
+    LSTMoutput[1].ft.z
+  )
+  LSTMoutput[1].ft.z : Plus 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left
+    LSTMoutput[1].ft.z./*+*/right
+  )
+  LSTMoutput[1].ft.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/left
+    LSTMoutput[1].ft.z./*+*/left./*+*/right
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 ()
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    featNorm
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].ft.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left
+    LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[1].dh
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].ft.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[1].ft.z./*+*/right.matrix
+  )
+  LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].ft.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[1].dc
+  )
+  LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].it : Sigmoid 0 x 0 (
+    LSTMoutput[1].it.z
+  )
+  LSTMoutput[1].it.z : Plus 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left
+    LSTMoutput[1].it.z./*+*/right
+  )
+  LSTMoutput[1].it.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/left
+    LSTMoutput[1].it.z./*+*/left./*+*/right
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 ()
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    featNorm
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].it.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/right./***/left
+    LSTMoutput[1].it.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[1].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[1].dh
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].it.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[1].it.z./*+*/right.matrix
+  )
+  LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].it.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[1].dc
+  )
+  LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].mt : ElementTimes 0 x 0 (
+    LSTMoutput[1].ot
+    LSTMoutput[1].mt./*.**/right
+  )
+  LSTMoutput[1].mt./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[1].ct
+  )
+  LSTMoutput[1].ot : Sigmoid 0 x 0 (
+    LSTMoutput[1].ot.z
+  )
+  LSTMoutput[1].ot.z : Plus 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left
+    LSTMoutput[1].ot.z./*+*/right
+  )
+  LSTMoutput[1].ot.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/left
+    LSTMoutput[1].ot.z./*+*/left./*+*/right
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 ()
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    featNorm
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].ot.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left
+    LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[1].dh
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].ot.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[1].ot.z./*+*/right.matrix
+  )
+  LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].ot.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[1].ct
+  )
+  LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].output : Times 0 x 0 (
+    LSTMoutput[1].Wmr
+    LSTMoutput[1].output./***/right
+  )
+  LSTMoutput[1].output./***/right : Scale 0 x 0 (
+    LSTMoutput[1].output./***/right.scalarScalingFactor
+    LSTMoutput[1].mt
+  )
+  LSTMoutput[1].output./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].output./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].Wmr : LearnableParameter 256 x 1024 ()
+  LSTMoutput[2].bft : ElementTimes 0 x 0 (
+    LSTMoutput[2].ft
+    LSTMoutput[2].dc
+  )
+  LSTMoutput[2].bit : ElementTimes 0 x 0 (
+    LSTMoutput[2].it
+    LSTMoutput[2].bit./*.**/right
+  )
+  LSTMoutput[2].bit./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z
+  )
+  LSTMoutput[2].bit./*.**/right.z : Plus 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/left
+    LSTMoutput[2].bit./*.**/right.z./*+*/right
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/left : Times 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left
+    LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[1].output
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].bit./*.**/right.z./*+*/right : Plus 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[2].dh
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].ct : Plus 0 x 0 (
+    LSTMoutput[2].bft
+    LSTMoutput[2].bit
+  )
+  LSTMoutput[2].dc : PastValue 1024 x 1 (
+    LSTMoutput[2].ct
+  )
+  LSTMoutput[2].dh : PastValue 256 x 1 (
+    LSTMoutput[2].output
+  )
+  LSTMoutput[2].ft : Sigmoid 0 x 0 (
+    LSTMoutput[2].ft.z
+  )
+  LSTMoutput[2].ft.z : Plus 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left
+    LSTMoutput[2].ft.z./*+*/right
+  )
+  LSTMoutput[2].ft.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/left
+    LSTMoutput[2].ft.z./*+*/left./*+*/right
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[1].output
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].ft.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left
+    LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[2].dh
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].ft.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[2].ft.z./*+*/right.matrix
+  )
+  LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].ft.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[2].dc
+  )
+  LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].it : Sigmoid 0 x 0 (
+    LSTMoutput[2].it.z
+  )
+  LSTMoutput[2].it.z : Plus 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left
+    LSTMoutput[2].it.z./*+*/right
+  )
+  LSTMoutput[2].it.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/left
+    LSTMoutput[2].it.z./*+*/left./*+*/right
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[1].output
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].it.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/right./***/left
+    LSTMoutput[2].it.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[2].dh
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].it.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[2].it.z./*+*/right.matrix
+  )
+  LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].it.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[2].dc
+  )
+  LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].mt : ElementTimes 0 x 0 (
+    LSTMoutput[2].ot
+    LSTMoutput[2].mt./*.**/right
+  )
+  LSTMoutput[2].mt./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[2].ct
+  )
+  LSTMoutput[2].ot : Sigmoid 0 x 0 (
+    LSTMoutput[2].ot.z
+  )
+  LSTMoutput[2].ot.z : Plus 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left
+    LSTMoutput[2].ot.z./*+*/right
+  )
+  LSTMoutput[2].ot.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/left
+    LSTMoutput[2].ot.z./*+*/left./*+*/right
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[1].output
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].ot.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left
+    LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[2].dh
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].ot.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[2].ot.z./*+*/right.matrix
+  )
+  LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].ot.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[2].ct
+  )
+  LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].output : Times 0 x 0 (
+    LSTMoutput[2].Wmr
+    LSTMoutput[2].output./***/right
+  )
+  LSTMoutput[2].output./***/right : Scale 0 x 0 (
+    LSTMoutput[2].output./***/right.scalarScalingFactor
+    LSTMoutput[2].mt
+  )
+  LSTMoutput[2].output./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].output./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].Wmr : LearnableParameter 256 x 1024 ()
+  LSTMoutput[3].bft : ElementTimes 0 x 0 (
+    LSTMoutput[3].ft
+    LSTMoutput[3].dc
+  )
+  LSTMoutput[3].bit : ElementTimes 0 x 0 (
+    LSTMoutput[3].it
+    LSTMoutput[3].bit./*.**/right
+  )
+  LSTMoutput[3].bit./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z
+  )
+  LSTMoutput[3].bit./*.**/right.z : Plus 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/left
+    LSTMoutput[3].bit./*.**/right.z./*+*/right
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/left : Times 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left
+    LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[2].output
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].bit./*.**/right.z./*+*/right : Plus 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[3].dh
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].ct : Plus 0 x 0 (
+    LSTMoutput[3].bft
+    LSTMoutput[3].bit
+  )
+  LSTMoutput[3].dc : PastValue 1024 x 1 (
+    LSTMoutput[3].ct
+  )
+  LSTMoutput[3].dh : PastValue 256 x 1 (
+    LSTMoutput[3].output
+  )
+  LSTMoutput[3].ft : Sigmoid 0 x 0 (
+    LSTMoutput[3].ft.z
+  )
+  LSTMoutput[3].ft.z : Plus 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left
+    LSTMoutput[3].ft.z./*+*/right
+  )
+  LSTMoutput[3].ft.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/left
+    LSTMoutput[3].ft.z./*+*/left./*+*/right
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[2].output
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].ft.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left
+    LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[3].dh
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].ft.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[3].ft.z./*+*/right.matrix
+  )
+  LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].ft.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[3].dc
+  )
+  LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].it : Sigmoid 0 x 0 (
+    LSTMoutput[3].it.z
+  )
+  LSTMoutput[3].it.z : Plus 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left
+    LSTMoutput[3].it.z./*+*/right
+  )
+  LSTMoutput[3].it.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/left
+    LSTMoutput[3].it.z./*+*/left./*+*/right
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[2].output
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].it.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/right./***/left
+    LSTMoutput[3].it.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[3].dh
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].it.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[3].it.z./*+*/right.matrix
+  )
+  LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].it.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[3].dc
+  )
+  LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].mt : ElementTimes 0 x 0 (
+    LSTMoutput[3].ot
+    LSTMoutput[3].mt./*.**/right
+  )
+  LSTMoutput[3].mt./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[3].ct
+  )
+  LSTMoutput[3].ot : Sigmoid 0 x 0 (
+    LSTMoutput[3].ot.z
+  )
+  LSTMoutput[3].ot.z : Plus 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left
+    LSTMoutput[3].ot.z./*+*/right
+  )
+  LSTMoutput[3].ot.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/left
+    LSTMoutput[3].ot.z./*+*/left./*+*/right
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[2].output
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].ot.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left
+    LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[3].dh
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].ot.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[3].ot.z./*+*/right.matrix
+  )
+  LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].ot.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[3].ct
+  )
+  LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].output : Times 0 x 0 (
+    LSTMoutput[3].Wmr
+    LSTMoutput[3].output./***/right
+  )
+  LSTMoutput[3].output./***/right : Scale 0 x 0 (
+    LSTMoutput[3].output./***/right.scalarScalingFactor
+    LSTMoutput[3].mt
+  )
+  LSTMoutput[3].output./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].output./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].Wmr : LearnableParameter 256 x 1024 ()
+  LSTMoutputW : Plus 0 x 0 (
+    LSTMoutputW./*+*/left
+    B
+  )
+  LSTMoutputW./*+*/left : Times 0 x 0 (
+    LSTMoutputW./*+*/left./***/left
+    LSTMoutputW./*+*/left./***/right
+  )
+  LSTMoutputW./*+*/left./***/left : LearnableParameter 132 x 256 ()
+  LSTMoutputW./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutputW./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[3].output
+  )
+  LSTMoutputW./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  ScaledLogLikelihood : Minus 0 x 0 (
+    LSTMoutputW
+    logPrior
+  )
+]
 GetTrainCriterionNodes  ...
 GetEvalCriterionNodes  ...
-Found 6 PreCompute nodes
-	NodeName: featNorm.xMean
-	NodeName: featNorm.xStdDev
-	NodeName: logPrior.Prior
-	NodeName: featNorm.xMean
-	NodeName: featNorm.xStdDev
-	NodeName: logPrior.Prior
+ nodes in the recurrent loops : 
+LSTMoutput[1].mt./*.**/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].mt./*.**/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].mt./*.**/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	 nodes in the recurrent loops : 
+LSTMoutput[1].mt./*.**/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].mt./*.**/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].mt./*.**/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	
+
+Validating for node cr. 272 nodes to process in pass 1.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
+
+Validating for node cr. 183 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
+
+Validating for node cr. 60 nodes to process in pass 3.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
+
+Validating for node cr, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
+
+127 out of 272 nodes do not share the minibatch layout with the input data.
+
+
+Precomputing --> 3 PreCompute nodes found.
+
+	NodeName: featNorm.invStdDevVector
+	NodeName: featNorm.meanVector
+	NodeName: logPrior.x
 minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
 requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
  nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	 nodes in the recurrent loops : 
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	
 
-Validating node featNorm.xMean 
+Validating for node featNorm.invStdDevVector. 3 nodes to process in pass 1.
 
-Validating --> features = InputValue -> [363, MBSize 308]
-Validating --> feashift = RowSlice(features[363, MBSize 308]) -> [33, MBSize 308]
-Validating --> featNorm.xMean = Mean(feashift[33, MBSize 308]) -> [33, 1]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
 
+Validating for node featNorm.invStdDevVector, final verification.
 
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
 
 1 out of 3 nodes do not share the minibatch layout with the input data.
+
  nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	 nodes in the recurrent loops : 
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	
 
-Validating node featNorm.xStdDev 
+Validating for node featNorm.meanVector. 3 nodes to process in pass 1.
 
-Validating --> features = InputValue -> [363, MBSize 308]
-Validating --> feashift = RowSlice(features[363, MBSize 308]) -> [33, MBSize 308]
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, MBSize 308]) -> [33, 1]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
 
+Validating for node featNorm.meanVector, final verification.
 
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
 
 1 out of 3 nodes do not share the minibatch layout with the input data.
+
  nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	 nodes in the recurrent loops : 
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	
 
-Validating node logPrior.Prior 
+Validating for node logPrior.x. 2 nodes to process in pass 1.
 
-Validating --> labels = InputValue -> [132, MBSize 308]
-Validating --> logPrior.Prior = Mean(labels[132, MBSize 308]) -> [132, 1]
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1]
 
+Validating for node logPrior.x. 1 nodes to process in pass 2.
 
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1]
+
+Validating for node logPrior.x, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1]
 
 1 out of 2 nodes do not share the minibatch layout with the input data.
+
+
+Precomputing --> Completed.
+
 Set Max Temp Mem Size For Convolution Nodes to 0 samples.
-Starting Epoch 1: learning rate per sample = 0.025000  momentum = 0.000000 
+Starting Epoch 1: learning rate per sample = 0.025000  effective momentum = 0.000000 
 minibatchiterator: epoch 0: frames [0..2560] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
+ nodes in the recurrent loops : 
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	 nodes in the recurrent loops : 
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	
+
+Validating for node Err. 272 nodes to process in pass 1.
+
+Validating --> labels = InputValue -> [132, MBSize 378]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 378]
+Validating --> feashift = RowSlice(features[363, MBSize 378]) -> [33, MBSize 378]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 378]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 378]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 378], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 378], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 378], LSTMoutput[1].bit./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 378], LSTMoutput[1].bit[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 378], LSTMoutput[1].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 378], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 378], LSTMoutput[2].bit./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 378], LSTMoutput[2].bit[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 378], LSTMoutput[2].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 378], LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 378], LSTMoutput[3].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 378], LSTMoutput[3].bit./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 378], LSTMoutput[3].bit[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 378], LSTMoutput[3].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 378]) -> [132, MBSize 378]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 378], B[132, 1]) -> [132, MBSize 378]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 378], LSTMoutputW[132, MBSize 378]) -> [1, 1]
+
+Validating for node Err. 180 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 378]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 378]
+Validating --> feashift = RowSlice(features[363, MBSize 378]) -> [33, MBSize 378]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 378]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 378]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 378], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 378], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 378], LSTMoutput[1].bit./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 378], LSTMoutput[1].bit[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 378], LSTMoutput[1].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 378], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 378], LSTMoutput[2].bit./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 378], LSTMoutput[2].bit[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 378], LSTMoutput[2].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 378], LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 378], LSTMoutput[3].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 378], LSTMoutput[3].bit./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 378], LSTMoutput[3].bit[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 378], LSTMoutput[3].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 378]) -> [132, MBSize 378]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 378], B[132, 1]) -> [132, MBSize 378]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 378], LSTMoutputW[132, MBSize 378]) -> [1, 1]
+
+Validating for node Err. 6 nodes to process in pass 3.
+
+Validating --> labels = InputValue -> [132, MBSize 378]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 378]
+Validating --> feashift = RowSlice(features[363, MBSize 378]) -> [33, MBSize 378]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 378]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 378]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 378], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 378], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 378], LSTMoutput[1].bit./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 378], LSTMoutput[1].bit[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 378], LSTMoutput[1].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 378], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 378], LSTMoutput[2].bit./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 378], LSTMoutput[2].bit[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 378], LSTMoutput[2].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 378], LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 378], LSTMoutput[3].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 378], LSTMoutput[3].bit./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 378], LSTMoutput[3].bit[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 378], LSTMoutput[3].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 378]) -> [132, MBSize 378]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 378], B[132, 1]) -> [132, MBSize 378]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 378], LSTMoutputW[132, MBSize 378]) -> [1, 1]
+
+Validating for node Err, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 378]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 378]
+Validating --> feashift = RowSlice(features[363, MBSize 378]) -> [33, MBSize 378]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 378]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 378]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 378], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 378]) -> [33, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 378], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 378], LSTMoutput[1].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 378], LSTMoutput[1].bit./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 378], LSTMoutput[1].bit[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 378], LSTMoutput[1].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 378], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 378], LSTMoutput[2].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 378], LSTMoutput[2].bit./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 378], LSTMoutput[2].bit[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 378], LSTMoutput[2].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 378], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 378], LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 378], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 378], LSTMoutput[3].it.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 378], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 378], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 378], LSTMoutput[3].bit./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 378], LSTMoutput[3].bit[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 378], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 378], LSTMoutput[3].mt./*.**/right[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 378]) -> [1024, MBSize 378]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 378]) -> [256, MBSize 378]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 378]) -> [132, MBSize 378]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 378], B[132, 1]) -> [132, MBSize 378]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 378], LSTMoutputW[132, MBSize 378]) -> [1, 1]
+
+127 out of 272 nodes do not share the minibatch layout with the input data.
+
 
 Starting minibatch loop.
- Epoch[ 1 of 2]-Minibatch[   1-   1 of 128]: SamplesSeen = 308; TrainLossPerSample =  4.88306536; EvalErr[0]PerSample = 0.99025974; TotalTime = 22.51031s; TotalTimePerSample = 73.08543ms; SamplesPerSecond = 13
- Epoch[ 1 of 2]-Minibatch[   2-   2 of 128]: SamplesSeen = 408; TrainLossPerSample =  4.62618899; EvalErr[0]PerSample = 0.84068627; TotalTime = 32.29271s; TotalTimePerSample = 79.14880ms; SamplesPerSecond = 12
- Epoch[ 1 of 2]-Minibatch[   3-   3 of 128]: SamplesSeen = 378; TrainLossPerSample =  4.56298699; EvalErr[0]PerSample = 0.94708995; TotalTime = 28.78492s; TotalTimePerSample = 76.15060ms; SamplesPerSecond = 13
- Epoch[ 1 of 2]-Minibatch[   4-   4 of 128]: SamplesSeen = 478; TrainLossPerSample =  3.76631656; EvalErr[0]PerSample = 0.78661088; TotalTime = 37.13984s; TotalTimePerSample = 77.69841ms; SamplesPerSecond = 12
- Epoch[ 1 of 2]-Minibatch[   5-   5 of 128]: SamplesSeen = 158; TrainLossPerSample = 8763.14191369; EvalErr[0]PerSample = 0.93037975; TotalTime = 11.48446s; TotalTimePerSample = 72.68647ms; SamplesPerSecond = 13
- Epoch[ 1 of 2]-Minibatch[   6-   6 of 128]: SamplesSeen = 258; TrainLossPerSample =  4.56298450; EvalErr[0]PerSample = 0.89922481; TotalTime = 21.40300s; TotalTimePerSample = 82.95738ms; SamplesPerSecond = 12
- Epoch[ 1 of 2]-Minibatch[   7-   7 of 128]: SamplesSeen = 328; TrainLossPerSample =  4.37957317; EvalErr[0]PerSample = 0.87500000; TotalTime = 26.18570s; TotalTimePerSample = 79.83445ms; SamplesPerSecond = 12
- Epoch[ 1 of 2]-Minibatch[   8-   8 of 128]: SamplesSeen = 288; TrainLossPerSample =  4.65104167; EvalErr[0]PerSample = 0.90625000; TotalTime = 19.35972s; TotalTimePerSample = 67.22124ms; SamplesPerSecond = 14
-Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 535.88568; EvalErrPerSample = 0.88671273; Ave LearnRatePerSample = 0.02500000037; EpochTime=199.19972
-Starting Epoch 2: learning rate per sample = 0.025000  momentum = 0.900000 
+ Epoch[ 1 of 2]-Minibatch[   1-   1 of 128]: SamplesSeen = 308; TrainLossPerSample =  4.88306536; EvalErr[0]PerSample = 0.99025974; TotalTime = 33.19239s; TotalTimePerSample = 107.76749ms; SamplesPerSecond = 9
+ Epoch[ 1 of 2]-Minibatch[   2-   2 of 128]: SamplesSeen = 408; TrainLossPerSample =  4.62618899; EvalErr[0]PerSample = 0.84068627; TotalTime = 47.40593s; TotalTimePerSample = 116.19102ms; SamplesPerSecond = 8
+ Epoch[ 1 of 2]-Minibatch[   3-   3 of 128]: SamplesSeen = 378; TrainLossPerSample =  4.56298570; EvalErr[0]PerSample = 0.94708995; TotalTime = 45.36358s; TotalTimePerSample = 120.00946ms; SamplesPerSecond = 8
+ Epoch[ 1 of 2]-Minibatch[   4-   4 of 128]: SamplesSeen = 478; TrainLossPerSample =  3.76631656; EvalErr[0]PerSample = 0.78661088; TotalTime = 56.13956s; TotalTimePerSample = 117.44677ms; SamplesPerSecond = 8
+ Epoch[ 1 of 2]-Minibatch[   5-   5 of 128]: SamplesSeen = 158; TrainLossPerSample = 8763.14508134; EvalErr[0]PerSample = 0.93037975; TotalTime = 16.77154s; TotalTimePerSample = 106.14897ms; SamplesPerSecond = 9
+ Epoch[ 1 of 2]-Minibatch[   6-   6 of 128]: SamplesSeen = 258; TrainLossPerSample =  4.56298450; EvalErr[0]PerSample = 0.89922481; TotalTime = 24.72391s; TotalTimePerSample = 95.82912ms; SamplesPerSecond = 10
+ Epoch[ 1 of 2]-Minibatch[   7-   7 of 128]: SamplesSeen = 328; TrainLossPerSample =  4.37957317; EvalErr[0]PerSample = 0.87500000; TotalTime = 28.38026s; TotalTimePerSample = 86.52517ms; SamplesPerSecond = 11
+ Epoch[ 1 of 2]-Minibatch[   8-   8 of 128]: SamplesSeen = 288; TrainLossPerSample =  4.65104167; EvalErr[0]PerSample = 0.90625000; TotalTime = 25.08192s; TotalTimePerSample = 87.09000ms; SamplesPerSecond = 11
+Finished Epoch[ 1 of 2]: [Training Set] TrainLossPerSample = 535.88586; EvalErrPerSample = 0.88671273; Ave LearnRatePerSample = 0.02500000037; EpochTime=277.708
+Starting Epoch 2: learning rate per sample = 0.025000  effective momentum = 0.900000 
 minibatchiterator: epoch 1: frames [2560..5120] (first utterance at frame 2604), data subset 0 of 1, with 1 datapasses
 
 Starting minibatch loop.
- Epoch[ 2 of 2]-Minibatch[   1-   1 of 128]: SamplesSeen = 508; TrainLossPerSample =  4.26512518; EvalErr[0]PerSample = 0.85433071; TotalTime = 41.28329s; TotalTimePerSample = 81.26632ms; SamplesPerSecond = 12
- Epoch[ 2 of 2]-Minibatch[   2-   2 of 128]: SamplesSeen = 228; TrainLossPerSample =  3.77295993; EvalErr[0]PerSample = 0.82456140; TotalTime = 14.96108s; TotalTimePerSample = 65.61879ms; SamplesPerSecond = 15
- Epoch[ 2 of 2]-Minibatch[   3-   3 of 128]: SamplesSeen = 88; TrainLossPerSample =  3.83270264; EvalErr[0]PerSample = 0.89772727; TotalTime = 5.98518s; TotalTimePerSample = 68.01345ms; SamplesPerSecond = 14
- Epoch[ 2 of 2]-Minibatch[   4-   4 of 128]: SamplesSeen = 208; TrainLossPerSample =  4.20982009; EvalErr[0]PerSample = 0.91826923; TotalTime = 13.63850s; TotalTimePerSample = 65.56970ms; SamplesPerSecond = 15
- Epoch[ 2 of 2]-Minibatch[   5-   5 of 128]: SamplesSeen = 198; TrainLossPerSample =  4.20819277; EvalErr[0]PerSample = 0.91919192; TotalTime = 13.50887s; TotalTimePerSample = 68.22662ms; SamplesPerSecond = 14
- Epoch[ 2 of 2]-Minibatch[   6-   6 of 128]: SamplesSeen = 458; TrainLossPerSample =  3.93088581; EvalErr[0]PerSample = 0.93231441; TotalTime = 33.74585s; TotalTimePerSample = 73.68089ms; SamplesPerSecond = 13
- Epoch[ 2 of 2]-Minibatch[   7-   7 of 128]: SamplesSeen = 258; TrainLossPerSample =  3.87346513; EvalErr[0]PerSample = 0.91860465; TotalTime = 21.74937s; TotalTimePerSample = 84.29989ms; SamplesPerSecond = 11
- Epoch[ 2 of 2]-Minibatch[   8-   8 of 128]: SamplesSeen = 218; TrainLossPerSample =  3.73194703; EvalErr[0]PerSample = 0.79816514; TotalTime = 16.44576s; TotalTimePerSample = 75.43928ms; SamplesPerSecond = 13
- Epoch[ 2 of 2]-Minibatch[   9-   9 of 128]: SamplesSeen = 238; TrainLossPerSample =  3.93201402; EvalErr[0]PerSample = 0.81932773; TotalTime = 19.53173s; TotalTimePerSample = 82.06611ms; SamplesPerSecond = 12
- Epoch[ 2 of 2]-Minibatch[  10-  10 of 128]: SamplesSeen = 248; TrainLossPerSample =  4.68575164; EvalErr[0]PerSample = 0.92741935; TotalTime = 19.55488s; TotalTimePerSample = 78.85032ms; SamplesPerSecond = 12
-Finished Epoch[ 2 of 2]: [Training Set] TrainLossPerSample = 4.0695133; EvalErrPerSample = 0.88188678; Ave LearnRatePerSample = 0.02500000037; EpochTime=200.45244
+ Epoch[ 2 of 2]-Minibatch[   1-   1 of 128]: SamplesSeen = 508; TrainLossPerSample =  4.26512518; EvalErr[0]PerSample = 0.85433071; TotalTime = 47.12508s; TotalTimePerSample = 92.76590ms; SamplesPerSecond = 10
+ Epoch[ 2 of 2]-Minibatch[   2-   2 of 128]: SamplesSeen = 228; TrainLossPerSample =  3.77295993; EvalErr[0]PerSample = 0.82456140; TotalTime = 20.64559s; TotalTimePerSample = 90.55082ms; SamplesPerSecond = 11
+ Epoch[ 2 of 2]-Minibatch[   3-   3 of 128]: SamplesSeen = 88; TrainLossPerSample =  3.83270264; EvalErr[0]PerSample = 0.89772727; TotalTime = 7.97201s; TotalTimePerSample = 90.59103ms; SamplesPerSecond = 11
+ Epoch[ 2 of 2]-Minibatch[   4-   4 of 128]: SamplesSeen = 208; TrainLossPerSample =  4.20982009; EvalErr[0]PerSample = 0.91826923; TotalTime = 16.50900s; TotalTimePerSample = 79.37017ms; SamplesPerSecond = 12
+ Epoch[ 2 of 2]-Minibatch[   5-   5 of 128]: SamplesSeen = 198; TrainLossPerSample =  4.20819523; EvalErr[0]PerSample = 0.91919192; TotalTime = 16.28937s; TotalTimePerSample = 82.26952ms; SamplesPerSecond = 12
+ Epoch[ 2 of 2]-Minibatch[   6-   6 of 128]: SamplesSeen = 458; TrainLossPerSample =  3.93088581; EvalErr[0]PerSample = 0.93231441; TotalTime = 40.95506s; TotalTimePerSample = 89.42153ms; SamplesPerSecond = 11
+ Epoch[ 2 of 2]-Minibatch[   7-   7 of 128]: SamplesSeen = 258; TrainLossPerSample =  3.87346892; EvalErr[0]PerSample = 0.91860465; TotalTime = 21.19852s; TotalTimePerSample = 82.16480ms; SamplesPerSecond = 12
+ Epoch[ 2 of 2]-Minibatch[   8-   8 of 128]: SamplesSeen = 218; TrainLossPerSample =  3.73194927; EvalErr[0]PerSample = 0.79816514; TotalTime = 18.45402s; TotalTimePerSample = 84.65144ms; SamplesPerSecond = 11
+ Epoch[ 2 of 2]-Minibatch[   9-   9 of 128]: SamplesSeen = 238; TrainLossPerSample =  3.93201402; EvalErr[0]PerSample = 0.81932773; TotalTime = 19.20098s; TotalTimePerSample = 80.67637ms; SamplesPerSecond = 12
+ Epoch[ 2 of 2]-Minibatch[  10-  10 of 128]: SamplesSeen = 248; TrainLossPerSample =  4.68575164; EvalErr[0]PerSample = 0.92741935; TotalTime = 23.74306s; TotalTimePerSample = 95.73816ms; SamplesPerSecond = 10
+Finished Epoch[ 2 of 2]: [Training Set] TrainLossPerSample = 4.0695143; EvalErrPerSample = 0.88188678; Ave LearnRatePerSample = 0.02500000037; EpochTime=232.15937
 CNTKCommandTrainEnd: speechTrain
 COMPLETED
diff --git a/Tests/Speech/LSTM/Truncated/baseline.gpu.txt b/Tests/Speech/LSTM/Truncated/baseline.gpu.txt
index bad10f355..4f3677b81 100644
--- a/Tests/Speech/LSTM/Truncated/baseline.gpu.txt
+++ b/Tests/Speech/LSTM/Truncated/baseline.gpu.txt
@@ -1,7 +1,7 @@
-=== Running /home/mluser/src/cplx_master/build/release/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/LSTM/cntk.config RunDir=/tmp/cntk-test-20150908125603.612544/Speech_LSTM@release_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data DeviceId=0 NDLDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM
-running on localhost at 2015/09/08 12:56:03
-command line options: 
-configFile=/home/mluser/src/cplx_master/Tests/Speech/LSTM/cntk.config RunDir=/tmp/cntk-test-20150908125603.612544/Speech_LSTM@release_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data DeviceId=0 NDLDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM 
+=== Running /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/LSTM/Truncated/../cntk.config RunDir=/tmp/cntk-test-20151024130338.510376/Speech/LSTM_Truncated@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/Truncated/.. DeviceId=0
+running on localhost at 2015/10/24 13:03:38
+command line: 
+/home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/LSTM/Truncated/../cntk.config RunDir=/tmp/cntk-test-20151024130338.510376/Speech/LSTM_Truncated@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/Truncated/.. DeviceId=0 
 
 >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
 precision=float
@@ -15,9 +15,6 @@ speechTrain=[
     modelPath=$RunDir$/models/cntkSpeech.dnn
     deviceId=$DeviceId$
     traceLevel=1
-    NDLNetworkBuilder=[
-        networkDescription=$NDLDir$/lstmp-3layer_WithSelfStab.ndl
-    ]    
     SGD=[
         epochSize=20480
         minibatchSize=20
@@ -191,10 +188,10 @@ feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features);
         ScaledLogLikelihood = Minus(LSTMoutputW, logPrior, tag='output')    // sadly we can't say x - y since we want to assign a tag
     ]
 ]
-RunDir=/tmp/cntk-test-20150908125603.612544/Speech_LSTM@release_gpu
+RunDir=/tmp/cntk-test-20151024130338.510376/Speech/LSTM_Truncated@debug_gpu
 DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
+ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/Truncated/..
 DeviceId=0
-NDLDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM
 
 <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
 
@@ -207,12 +204,9 @@ frameMode=false
 Truncated=true
 speechTrain=[
     action=train
-    modelPath=/tmp/cntk-test-20150908125603.612544/Speech_LSTM@release_gpu/models/cntkSpeech.dnn
+    modelPath=/tmp/cntk-test-20151024130338.510376/Speech/LSTM_Truncated@debug_gpu/models/cntkSpeech.dnn
     deviceId=0
     traceLevel=1
-    NDLNetworkBuilder=[
-        networkDescription=/home/mluser/src/cplx_master/Tests/Speech/LSTM/lstmp-3layer_WithSelfStab.ndl
-    ]    
     SGD=[
         epochSize=20480
         minibatchSize=20
@@ -386,30 +380,27 @@ feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features);
         ScaledLogLikelihood = Minus(LSTMoutputW, logPrior, tag='output')    // sadly we can't say x - y since we want to assign a tag
     ]
 ]
-RunDir=/tmp/cntk-test-20150908125603.612544/Speech_LSTM@release_gpu
+RunDir=/tmp/cntk-test-20151024130338.510376/Speech/LSTM_Truncated@debug_gpu
 DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
+ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/Truncated/..
 DeviceId=0
-NDLDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM
 
 <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 
 >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 configparameters: cntk.config:command=speechTrain
+configparameters: cntk.config:ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM/Truncated/..
 configparameters: cntk.config:DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
 configparameters: cntk.config:deviceId=0
 configparameters: cntk.config:frameMode=false
-configparameters: cntk.config:NDLDir=/home/mluser/src/cplx_master/Tests/Speech/LSTM
 configparameters: cntk.config:parallelTrain=false
 configparameters: cntk.config:precision=float
-configparameters: cntk.config:RunDir=/tmp/cntk-test-20150908125603.612544/Speech_LSTM@release_gpu
+configparameters: cntk.config:RunDir=/tmp/cntk-test-20151024130338.510376/Speech/LSTM_Truncated@debug_gpu
 configparameters: cntk.config:speechTrain=[
     action=train
-    modelPath=/tmp/cntk-test-20150908125603.612544/Speech_LSTM@release_gpu/models/cntkSpeech.dnn
+    modelPath=/tmp/cntk-test-20151024130338.510376/Speech/LSTM_Truncated@debug_gpu/models/cntkSpeech.dnn
     deviceId=0
     traceLevel=1
-    NDLNetworkBuilder=[
-        networkDescription=/home/mluser/src/cplx_master/Tests/Speech/LSTM/lstmp-3layer_WithSelfStab.ndl
-    ]    
     SGD=[
         epochSize=20480
         minibatchSize=20
@@ -588,7 +579,11 @@ configparameters: cntk.config:Truncated=true
 <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 command: speechTrain 
 precision = float
-NDLBuilder Using GPU 0
+CNTKModelPath: /tmp/cntk-test-20151024130338.510376/Speech/LSTM_Truncated@debug_gpu/models/cntkSpeech.dnn
+CNTKCommandTrainInfo: speechTrain : 4
+CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 4
+CNTKCommandTrainBegin: speechTrain
+ExperimentalNetworkBuilder using GPU 0
 reading script file /home/mluser/src/cplx_master/Tests/Speech/Data/glob_0000.scp ... 948 entries
 trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion
 total 132 state names in state list /home/mluser/src/cplx_master/Tests/Speech/Data/state.list
@@ -596,1781 +591,3382 @@ htkmlfreader: reading MLF file /home/mluser/src/cplx_master/Tests/Speech/Data/gl
 ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
 label set 0: 129 classes
 minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
- nodes in the recurrent loops : 
-LSTMoutput1.unnamed174	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.bit	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.unnamed224	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.bit	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.unnamed274	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.bit	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.unnamed174	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.bit	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.unnamed224	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.bit	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.unnamed274	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.bit	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
-
-Printing Gradient Computation Node Order ... 
-
-cr[0, 0] = CrossEntropyWithSoftmax(labels[132, 1], LSTMoutputW[0, 0])
-LSTMoutputW[0, 0] = Plus(unnamed283[0, 0], b[132, 1])
-b[132, 1] = LearnableParameter
-unnamed283[0, 0] = Times(W[132, 256], unnamed284[0, 0])
-unnamed284[0, 0] = Scale(expsW[0, 0], LSTMoutput3.output[0, 0])
-LSTMoutput3.output[0, 0] = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[0, 0])
-LSTMoutput3.unnamed275[0, 0] = Scale(LSTMoutput3.expsWmr[0, 0], LSTMoutput3.mt[0, 0])
-LSTMoutput3.mt[0, 0] = ElementTimes(LSTMoutput3.ot[0, 0], LSTMoutput3.unnamed274[0, 0])
-LSTMoutput3.unnamed274[0, 0] = Tanh(LSTMoutput3.ct[0, 0])
-LSTMoutput3.ot[0, 0] = Sigmoid(LSTMoutput3.unnamed271[0, 0])
-LSTMoutput3.unnamed271[0, 0] = Plus(LSTMoutput3.unnamed272[0, 0], LSTMoutput3.Wcoct[0, 0])
-LSTMoutput3.Wcoct[0, 0] = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[0, 0])
-LSTMoutput3.unnamed270[0, 0] = Scale(LSTMoutput3.expsWco[0, 0], LSTMoutput3.ct[0, 0])
-LSTMoutput3.ct[0, 0] = Plus(LSTMoutput3.bft[0, 0], LSTMoutput3.bit[0, 0])
-LSTMoutput3.bit[0, 0] = ElementTimes(LSTMoutput3.it[0, 0], LSTMoutput3.unnamed259[0, 0])
-LSTMoutput3.unnamed259[0, 0] = Tanh(LSTMoutput3.unnamed260[0, 0])
-LSTMoutput3.unnamed260[0, 0] = Plus(LSTMoutput3.Wxcx[0, 0], LSTMoutput3.unnamed261[0, 0])
-LSTMoutput3.unnamed261[0, 0] = Plus(LSTMoutput3.Whcdh[0, 0], LSTMoutput3.bc[1024, 1])
-LSTMoutput3.Whcdh[0, 0] = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[0, 0])
-LSTMoutput3.unnamed258[0, 0] = Scale(LSTMoutput3.expsWhc[0, 0], LSTMoutput3.dh[256, 1])
-LSTMoutput3.it[0, 0] = Sigmoid(LSTMoutput3.unnamed254[0, 0])
-LSTMoutput3.unnamed254[0, 0] = Plus(LSTMoutput3.unnamed255[0, 0], LSTMoutput3.Wcidc[0, 0])
-LSTMoutput3.Wcidc[0, 0] = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[0, 0])
-LSTMoutput3.unnamed253[0, 0] = Scale(LSTMoutput3.expsWci[0, 0], LSTMoutput3.dc[1024, 1])
-LSTMoutput3.unnamed255[0, 0] = Plus(LSTMoutput3.unnamed256[0, 0], LSTMoutput3.Whidh[0, 0])
-LSTMoutput3.Whidh[0, 0] = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[0, 0])
-LSTMoutput3.unnamed252[0, 0] = Scale(LSTMoutput3.expsWhi[0, 0], LSTMoutput3.dh[256, 1])
-LSTMoutput3.bft[0, 0] = ElementTimes(LSTMoutput3.ft[0, 0], LSTMoutput3.dc[1024, 1])
-LSTMoutput3.ft[0, 0] = Sigmoid(LSTMoutput3.unnamed265[0, 0])
-LSTMoutput3.unnamed265[0, 0] = Plus(LSTMoutput3.unnamed266[0, 0], LSTMoutput3.Wcfdc[0, 0])
-LSTMoutput3.Wcfdc[0, 0] = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[0, 0])
-LSTMoutput3.unnamed264[0, 0] = Scale(LSTMoutput3.expsWcf[0, 0], LSTMoutput3.dc[1024, 1])
-LSTMoutput3.dc[1024, 1] = PastValue(LSTMoutput3.ct[0, 0])
-LSTMoutput3.unnamed266[0, 0] = Plus(LSTMoutput3.unnamed267[0, 0], LSTMoutput3.Whfdh[0, 0])
-LSTMoutput3.Whfdh[0, 0] = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[0, 0])
-LSTMoutput3.unnamed263[0, 0] = Scale(LSTMoutput3.expsWhf[0, 0], LSTMoutput3.dh[256, 1])
-LSTMoutput3.unnamed272[0, 0] = Plus(LSTMoutput3.unnamed273[0, 0], LSTMoutput3.Whodh[0, 0])
-LSTMoutput3.Whodh[0, 0] = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[0, 0])
-LSTMoutput3.unnamed269[0, 0] = Scale(LSTMoutput3.expsWho[0, 0], LSTMoutput3.dh[256, 1])
-LSTMoutput3.dh[256, 1] = PastValue(LSTMoutput3.output[0, 0])
-LSTMoutput3.bc[1024, 1] = LearnableParameter
-LSTMoutput3.expsWhc[0, 0] = Exp(LSTMoutput3.sWhc[1, 1])
-LSTMoutput3.sWhc[1, 1] = LearnableParameter
-LSTMoutput3.Whc[1024, 256] = LearnableParameter
-LSTMoutput3.Wxcx[0, 0] = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[0, 0])
-LSTMoutput3.unnamed257[0, 0] = Scale(LSTMoutput3.expsWxc[0, 0], LSTMoutput2.output[0, 0])
-LSTMoutput3.expsWxc[0, 0] = Exp(LSTMoutput3.sWxc[1, 1])
-LSTMoutput3.sWxc[1, 1] = LearnableParameter
-LSTMoutput3.Wxc[1024, 256] = LearnableParameter
-LSTMoutput3.expsWci[0, 0] = Exp(LSTMoutput3.sWci[1, 1])
-LSTMoutput3.sWci[1, 1] = LearnableParameter
-LSTMoutput3.Wci[1024, 1] = LearnableParameter
-LSTMoutput3.expsWhi[0, 0] = Exp(LSTMoutput3.sWhi[1, 1])
-LSTMoutput3.sWhi[1, 1] = LearnableParameter
-LSTMoutput3.Whi[1024, 256] = LearnableParameter
-LSTMoutput3.unnamed256[0, 0] = Plus(LSTMoutput3.Wxix[0, 0], LSTMoutput3.bi[1024, 1])
-LSTMoutput3.bi[1024, 1] = LearnableParameter
-LSTMoutput3.Wxix[0, 0] = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[0, 0])
-LSTMoutput3.unnamed251[0, 0] = Scale(LSTMoutput3.expsWxi[0, 0], LSTMoutput2.output[0, 0])
-LSTMoutput3.expsWxi[0, 0] = Exp(LSTMoutput3.sWxi[1, 1])
-LSTMoutput3.sWxi[1, 1] = LearnableParameter
-LSTMoutput3.Wxi[1024, 256] = LearnableParameter
-LSTMoutput3.expsWcf[0, 0] = Exp(LSTMoutput3.sWcf[1, 1])
-LSTMoutput3.sWcf[1, 1] = LearnableParameter
-LSTMoutput3.Wcf[1024, 1] = LearnableParameter
-LSTMoutput3.expsWhf[0, 0] = Exp(LSTMoutput3.sWhf[1, 1])
-LSTMoutput3.sWhf[1, 1] = LearnableParameter
-LSTMoutput3.Whf[1024, 256] = LearnableParameter
-LSTMoutput3.unnamed267[0, 0] = Plus(LSTMoutput3.Wxfx[0, 0], LSTMoutput3.bf[1024, 1])
-LSTMoutput3.bf[1024, 1] = LearnableParameter
-LSTMoutput3.Wxfx[0, 0] = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[0, 0])
-LSTMoutput3.unnamed262[0, 0] = Scale(LSTMoutput3.expsWxf[0, 0], LSTMoutput2.output[0, 0])
-LSTMoutput3.expsWxf[0, 0] = Exp(LSTMoutput3.sWxf[1, 1])
-LSTMoutput3.sWxf[1, 1] = LearnableParameter
-LSTMoutput3.Wxf[1024, 256] = LearnableParameter
-LSTMoutput3.expsWco[0, 0] = Exp(LSTMoutput3.sWco[1, 1])
-LSTMoutput3.sWco[1, 1] = LearnableParameter
-LSTMoutput3.Wco[1024, 1] = LearnableParameter
-LSTMoutput3.expsWho[0, 0] = Exp(LSTMoutput3.sWho[1, 1])
-LSTMoutput3.sWho[1, 1] = LearnableParameter
-LSTMoutput3.Who[1024, 256] = LearnableParameter
-LSTMoutput3.unnamed273[0, 0] = Plus(LSTMoutput3.Wxox[0, 0], LSTMoutput3.bo[1024, 1])
-LSTMoutput3.bo[1024, 1] = LearnableParameter
-LSTMoutput3.Wxox[0, 0] = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[0, 0])
-LSTMoutput3.unnamed268[0, 0] = Scale(LSTMoutput3.expsWxo[0, 0], LSTMoutput2.output[0, 0])
-LSTMoutput2.output[0, 0] = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[0, 0])
-LSTMoutput2.unnamed225[0, 0] = Scale(LSTMoutput2.expsWmr[0, 0], LSTMoutput2.mt[0, 0])
-LSTMoutput2.mt[0, 0] = ElementTimes(LSTMoutput2.ot[0, 0], LSTMoutput2.unnamed224[0, 0])
-LSTMoutput2.unnamed224[0, 0] = Tanh(LSTMoutput2.ct[0, 0])
-LSTMoutput2.ot[0, 0] = Sigmoid(LSTMoutput2.unnamed221[0, 0])
-LSTMoutput2.unnamed221[0, 0] = Plus(LSTMoutput2.unnamed222[0, 0], LSTMoutput2.Wcoct[0, 0])
-LSTMoutput2.Wcoct[0, 0] = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[0, 0])
-LSTMoutput2.unnamed220[0, 0] = Scale(LSTMoutput2.expsWco[0, 0], LSTMoutput2.ct[0, 0])
-LSTMoutput2.ct[0, 0] = Plus(LSTMoutput2.bft[0, 0], LSTMoutput2.bit[0, 0])
-LSTMoutput2.bit[0, 0] = ElementTimes(LSTMoutput2.it[0, 0], LSTMoutput2.unnamed209[0, 0])
-LSTMoutput2.unnamed209[0, 0] = Tanh(LSTMoutput2.unnamed210[0, 0])
-LSTMoutput2.unnamed210[0, 0] = Plus(LSTMoutput2.Wxcx[0, 0], LSTMoutput2.unnamed211[0, 0])
-LSTMoutput2.unnamed211[0, 0] = Plus(LSTMoutput2.Whcdh[0, 0], LSTMoutput2.bc[1024, 1])
-LSTMoutput2.Whcdh[0, 0] = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[0, 0])
-LSTMoutput2.unnamed208[0, 0] = Scale(LSTMoutput2.expsWhc[0, 0], LSTMoutput2.dh[256, 1])
-LSTMoutput2.it[0, 0] = Sigmoid(LSTMoutput2.unnamed204[0, 0])
-LSTMoutput2.unnamed204[0, 0] = Plus(LSTMoutput2.unnamed205[0, 0], LSTMoutput2.Wcidc[0, 0])
-LSTMoutput2.Wcidc[0, 0] = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[0, 0])
-LSTMoutput2.unnamed203[0, 0] = Scale(LSTMoutput2.expsWci[0, 0], LSTMoutput2.dc[1024, 1])
-LSTMoutput2.unnamed205[0, 0] = Plus(LSTMoutput2.unnamed206[0, 0], LSTMoutput2.Whidh[0, 0])
-LSTMoutput2.Whidh[0, 0] = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[0, 0])
-LSTMoutput2.unnamed202[0, 0] = Scale(LSTMoutput2.expsWhi[0, 0], LSTMoutput2.dh[256, 1])
-LSTMoutput2.bft[0, 0] = ElementTimes(LSTMoutput2.ft[0, 0], LSTMoutput2.dc[1024, 1])
-LSTMoutput2.ft[0, 0] = Sigmoid(LSTMoutput2.unnamed215[0, 0])
-LSTMoutput2.unnamed215[0, 0] = Plus(LSTMoutput2.unnamed216[0, 0], LSTMoutput2.Wcfdc[0, 0])
-LSTMoutput2.Wcfdc[0, 0] = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[0, 0])
-LSTMoutput2.unnamed214[0, 0] = Scale(LSTMoutput2.expsWcf[0, 0], LSTMoutput2.dc[1024, 1])
-LSTMoutput2.dc[1024, 1] = PastValue(LSTMoutput2.ct[0, 0])
-LSTMoutput2.unnamed216[0, 0] = Plus(LSTMoutput2.unnamed217[0, 0], LSTMoutput2.Whfdh[0, 0])
-LSTMoutput2.Whfdh[0, 0] = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[0, 0])
-LSTMoutput2.unnamed213[0, 0] = Scale(LSTMoutput2.expsWhf[0, 0], LSTMoutput2.dh[256, 1])
-LSTMoutput2.unnamed222[0, 0] = Plus(LSTMoutput2.unnamed223[0, 0], LSTMoutput2.Whodh[0, 0])
-LSTMoutput2.Whodh[0, 0] = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[0, 0])
-LSTMoutput2.unnamed219[0, 0] = Scale(LSTMoutput2.expsWho[0, 0], LSTMoutput2.dh[256, 1])
-LSTMoutput2.dh[256, 1] = PastValue(LSTMoutput2.output[0, 0])
-LSTMoutput2.bc[1024, 1] = LearnableParameter
-LSTMoutput2.expsWhc[0, 0] = Exp(LSTMoutput2.sWhc[1, 1])
-LSTMoutput2.sWhc[1, 1] = LearnableParameter
-LSTMoutput2.Whc[1024, 256] = LearnableParameter
-LSTMoutput2.Wxcx[0, 0] = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[0, 0])
-LSTMoutput2.unnamed207[0, 0] = Scale(LSTMoutput2.expsWxc[0, 0], LSTMoutput1.output[0, 0])
-LSTMoutput2.expsWxc[0, 0] = Exp(LSTMoutput2.sWxc[1, 1])
-LSTMoutput2.sWxc[1, 1] = LearnableParameter
-LSTMoutput2.Wxc[1024, 256] = LearnableParameter
-LSTMoutput2.expsWci[0, 0] = Exp(LSTMoutput2.sWci[1, 1])
-LSTMoutput2.sWci[1, 1] = LearnableParameter
-LSTMoutput2.Wci[1024, 1] = LearnableParameter
-LSTMoutput2.expsWhi[0, 0] = Exp(LSTMoutput2.sWhi[1, 1])
-LSTMoutput2.sWhi[1, 1] = LearnableParameter
-LSTMoutput2.Whi[1024, 256] = LearnableParameter
-LSTMoutput2.unnamed206[0, 0] = Plus(LSTMoutput2.Wxix[0, 0], LSTMoutput2.bi[1024, 1])
-LSTMoutput2.bi[1024, 1] = LearnableParameter
-LSTMoutput2.Wxix[0, 0] = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[0, 0])
-LSTMoutput2.unnamed201[0, 0] = Scale(LSTMoutput2.expsWxi[0, 0], LSTMoutput1.output[0, 0])
-LSTMoutput2.expsWxi[0, 0] = Exp(LSTMoutput2.sWxi[1, 1])
-LSTMoutput2.sWxi[1, 1] = LearnableParameter
-LSTMoutput2.Wxi[1024, 256] = LearnableParameter
-LSTMoutput2.expsWcf[0, 0] = Exp(LSTMoutput2.sWcf[1, 1])
-LSTMoutput2.sWcf[1, 1] = LearnableParameter
-LSTMoutput2.Wcf[1024, 1] = LearnableParameter
-LSTMoutput2.expsWhf[0, 0] = Exp(LSTMoutput2.sWhf[1, 1])
-LSTMoutput2.sWhf[1, 1] = LearnableParameter
-LSTMoutput2.Whf[1024, 256] = LearnableParameter
-LSTMoutput2.unnamed217[0, 0] = Plus(LSTMoutput2.Wxfx[0, 0], LSTMoutput2.bf[1024, 1])
-LSTMoutput2.bf[1024, 1] = LearnableParameter
-LSTMoutput2.Wxfx[0, 0] = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[0, 0])
-LSTMoutput2.unnamed212[0, 0] = Scale(LSTMoutput2.expsWxf[0, 0], LSTMoutput1.output[0, 0])
-LSTMoutput2.expsWxf[0, 0] = Exp(LSTMoutput2.sWxf[1, 1])
-LSTMoutput2.sWxf[1, 1] = LearnableParameter
-LSTMoutput2.Wxf[1024, 256] = LearnableParameter
-LSTMoutput2.expsWco[0, 0] = Exp(LSTMoutput2.sWco[1, 1])
-LSTMoutput2.sWco[1, 1] = LearnableParameter
-LSTMoutput2.Wco[1024, 1] = LearnableParameter
-LSTMoutput2.expsWho[0, 0] = Exp(LSTMoutput2.sWho[1, 1])
-LSTMoutput2.sWho[1, 1] = LearnableParameter
-LSTMoutput2.Who[1024, 256] = LearnableParameter
-LSTMoutput2.unnamed223[0, 0] = Plus(LSTMoutput2.Wxox[0, 0], LSTMoutput2.bo[1024, 1])
-LSTMoutput2.bo[1024, 1] = LearnableParameter
-LSTMoutput2.Wxox[0, 0] = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[0, 0])
-LSTMoutput2.unnamed218[0, 0] = Scale(LSTMoutput2.expsWxo[0, 0], LSTMoutput1.output[0, 0])
-LSTMoutput1.output[0, 0] = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[0, 0])
-LSTMoutput1.unnamed175[0, 0] = Scale(LSTMoutput1.expsWmr[0, 0], LSTMoutput1.mt[0, 0])
-LSTMoutput1.mt[0, 0] = ElementTimes(LSTMoutput1.ot[0, 0], LSTMoutput1.unnamed174[0, 0])
-LSTMoutput1.unnamed174[0, 0] = Tanh(LSTMoutput1.ct[0, 0])
-LSTMoutput1.ot[0, 0] = Sigmoid(LSTMoutput1.unnamed171[0, 0])
-LSTMoutput1.unnamed171[0, 0] = Plus(LSTMoutput1.unnamed172[0, 0], LSTMoutput1.Wcoct[0, 0])
-LSTMoutput1.Wcoct[0, 0] = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[0, 0])
-LSTMoutput1.unnamed170[0, 0] = Scale(LSTMoutput1.expsWco[0, 0], LSTMoutput1.ct[0, 0])
-LSTMoutput1.ct[0, 0] = Plus(LSTMoutput1.bft[0, 0], LSTMoutput1.bit[0, 0])
-LSTMoutput1.bit[0, 0] = ElementTimes(LSTMoutput1.it[0, 0], LSTMoutput1.unnamed159[0, 0])
-LSTMoutput1.unnamed159[0, 0] = Tanh(LSTMoutput1.unnamed160[0, 0])
-LSTMoutput1.unnamed160[0, 0] = Plus(LSTMoutput1.Wxcx[0, 0], LSTMoutput1.unnamed161[0, 0])
-LSTMoutput1.unnamed161[0, 0] = Plus(LSTMoutput1.Whcdh[0, 0], LSTMoutput1.bc[1024, 1])
-LSTMoutput1.Whcdh[0, 0] = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[0, 0])
-LSTMoutput1.unnamed158[0, 0] = Scale(LSTMoutput1.expsWhc[0, 0], LSTMoutput1.dh[256, 1])
-LSTMoutput1.it[0, 0] = Sigmoid(LSTMoutput1.unnamed154[0, 0])
-LSTMoutput1.unnamed154[0, 0] = Plus(LSTMoutput1.unnamed155[0, 0], LSTMoutput1.Wcidc[0, 0])
-LSTMoutput1.Wcidc[0, 0] = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[0, 0])
-LSTMoutput1.unnamed153[0, 0] = Scale(LSTMoutput1.expsWci[0, 0], LSTMoutput1.dc[1024, 1])
-LSTMoutput1.unnamed155[0, 0] = Plus(LSTMoutput1.unnamed156[0, 0], LSTMoutput1.Whidh[0, 0])
-LSTMoutput1.Whidh[0, 0] = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[0, 0])
-LSTMoutput1.unnamed152[0, 0] = Scale(LSTMoutput1.expsWhi[0, 0], LSTMoutput1.dh[256, 1])
-LSTMoutput1.bft[0, 0] = ElementTimes(LSTMoutput1.ft[0, 0], LSTMoutput1.dc[1024, 1])
-LSTMoutput1.ft[0, 0] = Sigmoid(LSTMoutput1.unnamed165[0, 0])
-LSTMoutput1.unnamed165[0, 0] = Plus(LSTMoutput1.unnamed166[0, 0], LSTMoutput1.Wcfdc[0, 0])
-LSTMoutput1.Wcfdc[0, 0] = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[0, 0])
-LSTMoutput1.unnamed164[0, 0] = Scale(LSTMoutput1.expsWcf[0, 0], LSTMoutput1.dc[1024, 1])
-LSTMoutput1.dc[1024, 1] = PastValue(LSTMoutput1.ct[0, 0])
-LSTMoutput1.unnamed166[0, 0] = Plus(LSTMoutput1.unnamed167[0, 0], LSTMoutput1.Whfdh[0, 0])
-LSTMoutput1.Whfdh[0, 0] = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[0, 0])
-LSTMoutput1.unnamed163[0, 0] = Scale(LSTMoutput1.expsWhf[0, 0], LSTMoutput1.dh[256, 1])
-LSTMoutput1.unnamed172[0, 0] = Plus(LSTMoutput1.unnamed173[0, 0], LSTMoutput1.Whodh[0, 0])
-LSTMoutput1.Whodh[0, 0] = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[0, 0])
-LSTMoutput1.unnamed169[0, 0] = Scale(LSTMoutput1.expsWho[0, 0], LSTMoutput1.dh[256, 1])
-LSTMoutput1.dh[256, 1] = PastValue(LSTMoutput1.output[0, 0])
-LSTMoutput1.bc[1024, 1] = LearnableParameter
-LSTMoutput1.expsWhc[0, 0] = Exp(LSTMoutput1.sWhc[1, 1])
-LSTMoutput1.sWhc[1, 1] = LearnableParameter
-LSTMoutput1.Whc[1024, 256] = LearnableParameter
-LSTMoutput1.Wxcx[0, 0] = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[0, 0])
-LSTMoutput1.unnamed157[0, 0] = Scale(LSTMoutput1.expsWxc[0, 0], featNorm.xNorm[0, 0])
-LSTMoutput1.expsWxc[0, 0] = Exp(LSTMoutput1.sWxc[1, 1])
-LSTMoutput1.sWxc[1, 1] = LearnableParameter
-LSTMoutput1.Wxc[1024, 33] = LearnableParameter
-LSTMoutput1.expsWci[0, 0] = Exp(LSTMoutput1.sWci[1, 1])
-LSTMoutput1.sWci[1, 1] = LearnableParameter
-LSTMoutput1.Wci[1024, 1] = LearnableParameter
-LSTMoutput1.expsWhi[0, 0] = Exp(LSTMoutput1.sWhi[1, 1])
-LSTMoutput1.sWhi[1, 1] = LearnableParameter
-LSTMoutput1.Whi[1024, 256] = LearnableParameter
-LSTMoutput1.unnamed156[0, 0] = Plus(LSTMoutput1.Wxix[0, 0], LSTMoutput1.bi[1024, 1])
-LSTMoutput1.bi[1024, 1] = LearnableParameter
-LSTMoutput1.Wxix[0, 0] = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[0, 0])
-LSTMoutput1.unnamed151[0, 0] = Scale(LSTMoutput1.expsWxi[0, 0], featNorm.xNorm[0, 0])
-LSTMoutput1.expsWxi[0, 0] = Exp(LSTMoutput1.sWxi[1, 1])
-LSTMoutput1.sWxi[1, 1] = LearnableParameter
-LSTMoutput1.Wxi[1024, 33] = LearnableParameter
-LSTMoutput1.expsWcf[0, 0] = Exp(LSTMoutput1.sWcf[1, 1])
-LSTMoutput1.sWcf[1, 1] = LearnableParameter
-LSTMoutput1.Wcf[1024, 1] = LearnableParameter
-LSTMoutput1.expsWhf[0, 0] = Exp(LSTMoutput1.sWhf[1, 1])
-LSTMoutput1.sWhf[1, 1] = LearnableParameter
-LSTMoutput1.Whf[1024, 256] = LearnableParameter
-LSTMoutput1.unnamed167[0, 0] = Plus(LSTMoutput1.Wxfx[0, 0], LSTMoutput1.bf[1024, 1])
-LSTMoutput1.bf[1024, 1] = LearnableParameter
-LSTMoutput1.Wxfx[0, 0] = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[0, 0])
-LSTMoutput1.unnamed162[0, 0] = Scale(LSTMoutput1.expsWxf[0, 0], featNorm.xNorm[0, 0])
-LSTMoutput1.expsWxf[0, 0] = Exp(LSTMoutput1.sWxf[1, 1])
-LSTMoutput1.sWxf[1, 1] = LearnableParameter
-LSTMoutput1.Wxf[1024, 33] = LearnableParameter
-LSTMoutput1.expsWco[0, 0] = Exp(LSTMoutput1.sWco[1, 1])
-LSTMoutput1.sWco[1, 1] = LearnableParameter
-LSTMoutput1.Wco[1024, 1] = LearnableParameter
-LSTMoutput1.expsWho[0, 0] = Exp(LSTMoutput1.sWho[1, 1])
-LSTMoutput1.sWho[1, 1] = LearnableParameter
-LSTMoutput1.Who[1024, 256] = LearnableParameter
-LSTMoutput1.unnamed173[0, 0] = Plus(LSTMoutput1.Wxox[0, 0], LSTMoutput1.bo[1024, 1])
-LSTMoutput1.bo[1024, 1] = LearnableParameter
-LSTMoutput1.Wxox[0, 0] = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[0, 0])
-LSTMoutput1.unnamed168[0, 0] = Scale(LSTMoutput1.expsWxo[0, 0], featNorm.xNorm[0, 0])
-featNorm.xNorm[0, 0] = PerDimMeanVarNormalization(feashift[0, 0], featNorm.xMean[0, 0], featNorm.xStdDev[0, 0])
-featNorm.xStdDev[0, 0] = InvStdDev(feashift[0, 0])
-featNorm.xMean[0, 0] = Mean(feashift[0, 0])
-feashift[0, 0] = RowSlice(features[363, 1])
-features[363, 1] = InputValue
-LSTMoutput1.expsWxo[0, 0] = Exp(LSTMoutput1.sWxo[1, 1])
-LSTMoutput1.sWxo[1, 1] = LearnableParameter
-LSTMoutput1.Wxo[1024, 33] = LearnableParameter
-LSTMoutput1.expsWmr[0, 0] = Exp(LSTMoutput1.sWmr[1, 1])
-LSTMoutput1.sWmr[1, 1] = LearnableParameter
-LSTMoutput1.Wmr[256, 1024] = LearnableParameter
-LSTMoutput2.expsWxo[0, 0] = Exp(LSTMoutput2.sWxo[1, 1])
-LSTMoutput2.sWxo[1, 1] = LearnableParameter
-LSTMoutput2.Wxo[1024, 256] = LearnableParameter
-LSTMoutput2.expsWmr[0, 0] = Exp(LSTMoutput2.sWmr[1, 1])
-LSTMoutput2.sWmr[1, 1] = LearnableParameter
-LSTMoutput2.Wmr[256, 1024] = LearnableParameter
-LSTMoutput3.expsWxo[0, 0] = Exp(LSTMoutput3.sWxo[1, 1])
-LSTMoutput3.sWxo[1, 1] = LearnableParameter
-LSTMoutput3.Wxo[1024, 256] = LearnableParameter
-LSTMoutput3.expsWmr[0, 0] = Exp(LSTMoutput3.sWmr[1, 1])
-LSTMoutput3.sWmr[1, 1] = LearnableParameter
-LSTMoutput3.Wmr[256, 1024] = LearnableParameter
-expsW[0, 0] = Exp(sW[1, 1])
-sW[1, 1] = LearnableParameter
-W[132, 256] = LearnableParameter
-labels[132, 1] = InputValue
-
-Validating node cr 
-
-Validating --> labels = InputValue
-Validating --> W = LearnableParameter
-Validating --> sW = LearnableParameter
-Validating --> expsW = Exp(sW[1, 1])
-Validating --> LSTMoutput3.Wmr = LearnableParameter
-Validating --> LSTMoutput3.sWmr = LearnableParameter
-Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1])
-Validating --> LSTMoutput3.Wxo = LearnableParameter
-Validating --> LSTMoutput3.sWxo = LearnableParameter
-Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1])
-Validating --> LSTMoutput2.Wmr = LearnableParameter
-Validating --> LSTMoutput2.sWmr = LearnableParameter
-Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1])
-Validating --> LSTMoutput2.Wxo = LearnableParameter
-Validating --> LSTMoutput2.sWxo = LearnableParameter
-Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1])
-Validating --> LSTMoutput1.Wmr = LearnableParameter
-Validating --> LSTMoutput1.sWmr = LearnableParameter
-Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1])
-Validating --> LSTMoutput1.Wxo = LearnableParameter
-Validating --> LSTMoutput1.sWxo = LearnableParameter
-Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1])
-Validating --> features = InputValue
-Validating --> feashift = RowSlice(features[363, 1])
-Validating --> featNorm.xMean = Mean(feashift[33, 1])
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 1])
-Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1])
-Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, 1])
-Validating --> LSTMoutput1.bo = LearnableParameter
-Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, 1], LSTMoutput1.bo[1024, 1])
-Validating --> LSTMoutput1.Who = LearnableParameter
-Validating --> LSTMoutput1.sWho = LearnableParameter
-Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1])
-Validating --> LSTMoutput1.Wco = LearnableParameter
-Validating --> LSTMoutput1.sWco = LearnableParameter
-Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1])
-Validating --> LSTMoutput1.Wxf = LearnableParameter
-Validating --> LSTMoutput1.sWxf = LearnableParameter
-Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1])
-Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, 1])
-Validating --> LSTMoutput1.bf = LearnableParameter
-Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, 1], LSTMoutput1.bf[1024, 1])
-Validating --> LSTMoutput1.Whf = LearnableParameter
-Validating --> LSTMoutput1.sWhf = LearnableParameter
-Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1])
-Validating --> LSTMoutput1.Wcf = LearnableParameter
-Validating --> LSTMoutput1.sWcf = LearnableParameter
-Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1])
-Validating --> LSTMoutput1.Wxi = LearnableParameter
-Validating --> LSTMoutput1.sWxi = LearnableParameter
-Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1])
-Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, 1])
-Validating --> LSTMoutput1.bi = LearnableParameter
-Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, 1], LSTMoutput1.bi[1024, 1])
-Validating --> LSTMoutput1.Whi = LearnableParameter
-Validating --> LSTMoutput1.sWhi = LearnableParameter
-Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1])
-Validating --> LSTMoutput1.Wci = LearnableParameter
-Validating --> LSTMoutput1.sWci = LearnableParameter
-Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1])
-Validating --> LSTMoutput1.Wxc = LearnableParameter
-Validating --> LSTMoutput1.sWxc = LearnableParameter
-Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1])
-Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, 1])
-Validating --> LSTMoutput1.Whc = LearnableParameter
-Validating --> LSTMoutput1.sWhc = LearnableParameter
-Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1])
-Validating --> LSTMoutput1.bc = LearnableParameter
-Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[0 {W=472446402560, H=14145, C=120}, 0])
-Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256 {W=472446402560, H=14145, C=120}, 1])
-Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256 {W=472446402560, H=14145, C=120}, 1])
-Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, 1], LSTMoutput1.Whodh[1024, 1])
-Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256 {W=472446402560, H=14145, C=120}, 1])
-Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256 {W=472446402560, H=14145, C=120}, 1])
-Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, 1], LSTMoutput1.Whfdh[1024, 1])
-Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[0 {W=472446402560, H=24545, C=120}, 0])
-Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, 1], LSTMoutput1.Wcfdc[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256 {W=472446402560, H=14145, C=120}, 1])
-Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256 {W=472446402560, H=14145, C=120}, 1])
-Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, 1], LSTMoutput1.Whidh[1024, 1])
-Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, 1], LSTMoutput1.Wcidc[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256 {W=472446402560, H=14145, C=120}, 1])
-Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256 {W=472446402560, H=14145, C=120}, 1])
-Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 1], LSTMoutput1.bc[1024, 1])
-Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, 1], LSTMoutput1.unnamed161[1024, 1])
-Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, 1])
-Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.unnamed159[1024, 1])
-Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.bit[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, 1], LSTMoutput1.Wcoct[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.unnamed174[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, 1])
-Validating --> LSTMoutput2.bo = LearnableParameter
-Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, 1], LSTMoutput2.bo[1024, 1])
-Validating --> LSTMoutput2.Who = LearnableParameter
-Validating --> LSTMoutput2.sWho = LearnableParameter
-Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1])
-Validating --> LSTMoutput2.Wco = LearnableParameter
-Validating --> LSTMoutput2.sWco = LearnableParameter
-Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1])
-Validating --> LSTMoutput2.Wxf = LearnableParameter
-Validating --> LSTMoutput2.sWxf = LearnableParameter
-Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1])
-Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, 1])
-Validating --> LSTMoutput2.bf = LearnableParameter
-Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, 1], LSTMoutput2.bf[1024, 1])
-Validating --> LSTMoutput2.Whf = LearnableParameter
-Validating --> LSTMoutput2.sWhf = LearnableParameter
-Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1])
-Validating --> LSTMoutput2.Wcf = LearnableParameter
-Validating --> LSTMoutput2.sWcf = LearnableParameter
-Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1])
-Validating --> LSTMoutput2.Wxi = LearnableParameter
-Validating --> LSTMoutput2.sWxi = LearnableParameter
-Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1])
-Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, 1])
-Validating --> LSTMoutput2.bi = LearnableParameter
-Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, 1], LSTMoutput2.bi[1024, 1])
-Validating --> LSTMoutput2.Whi = LearnableParameter
-Validating --> LSTMoutput2.sWhi = LearnableParameter
-Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1])
-Validating --> LSTMoutput2.Wci = LearnableParameter
-Validating --> LSTMoutput2.sWci = LearnableParameter
-Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1])
-Validating --> LSTMoutput2.Wxc = LearnableParameter
-Validating --> LSTMoutput2.sWxc = LearnableParameter
-Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1])
-Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, 1])
-Validating --> LSTMoutput2.Whc = LearnableParameter
-Validating --> LSTMoutput2.sWhc = LearnableParameter
-Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1])
-Validating --> LSTMoutput2.bc = LearnableParameter
-Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[0 {W=472446402560, H=31873, C=120}, 0])
-Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256 {W=472446402560, H=31873, C=120}, 1])
-Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256 {W=472446402560, H=31873, C=120}, 1])
-Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, 1], LSTMoutput2.Whodh[1024, 1])
-Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256 {W=472446402560, H=31873, C=120}, 1])
-Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256 {W=472446402560, H=31873, C=120}, 1])
-Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, 1], LSTMoutput2.Whfdh[1024, 1])
-Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[0 {W=472446402560, H=42273, C=120}, 0])
-Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, 1], LSTMoutput2.Wcfdc[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256 {W=472446402560, H=31873, C=120}, 1])
-Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256 {W=472446402560, H=31873, C=120}, 1])
-Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, 1], LSTMoutput2.Whidh[1024, 1])
-Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, 1], LSTMoutput2.Wcidc[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256 {W=472446402560, H=31873, C=120}, 1])
-Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256 {W=472446402560, H=31873, C=120}, 1])
-Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, 1], LSTMoutput2.bc[1024, 1])
-Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, 1], LSTMoutput2.unnamed211[1024, 1])
-Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, 1])
-Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.unnamed209[1024, 1])
-Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.bit[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, 1], LSTMoutput2.Wcoct[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.unnamed224[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, 1])
-Validating --> LSTMoutput3.bo = LearnableParameter
-Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, 1], LSTMoutput3.bo[1024, 1])
-Validating --> LSTMoutput3.Who = LearnableParameter
-Validating --> LSTMoutput3.sWho = LearnableParameter
-Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1])
-Validating --> LSTMoutput3.Wco = LearnableParameter
-Validating --> LSTMoutput3.sWco = LearnableParameter
-Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1])
-Validating --> LSTMoutput3.Wxf = LearnableParameter
-Validating --> LSTMoutput3.sWxf = LearnableParameter
-Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1])
-Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, 1])
-Validating --> LSTMoutput3.bf = LearnableParameter
-Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, 1], LSTMoutput3.bf[1024, 1])
-Validating --> LSTMoutput3.Whf = LearnableParameter
-Validating --> LSTMoutput3.sWhf = LearnableParameter
-Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1])
-Validating --> LSTMoutput3.Wcf = LearnableParameter
-Validating --> LSTMoutput3.sWcf = LearnableParameter
-Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1])
-Validating --> LSTMoutput3.Wxi = LearnableParameter
-Validating --> LSTMoutput3.sWxi = LearnableParameter
-Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1])
-Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, 1])
-Validating --> LSTMoutput3.bi = LearnableParameter
-Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, 1], LSTMoutput3.bi[1024, 1])
-Validating --> LSTMoutput3.Whi = LearnableParameter
-Validating --> LSTMoutput3.sWhi = LearnableParameter
-Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1])
-Validating --> LSTMoutput3.Wci = LearnableParameter
-Validating --> LSTMoutput3.sWci = LearnableParameter
-Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1])
-Validating --> LSTMoutput3.Wxc = LearnableParameter
-Validating --> LSTMoutput3.sWxc = LearnableParameter
-Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1])
-Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, 1])
-Validating --> LSTMoutput3.Whc = LearnableParameter
-Validating --> LSTMoutput3.sWhc = LearnableParameter
-Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1])
-Validating --> LSTMoutput3.bc = LearnableParameter
-Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[0 {W=472446402560, H=51281, C=120}, 0])
-Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256 {W=472446402560, H=51281, C=120}, 1])
-Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256 {W=472446402560, H=51281, C=120}, 1])
-Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, 1], LSTMoutput3.Whodh[1024, 1])
-Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256 {W=472446402560, H=51281, C=120}, 1])
-Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256 {W=472446402560, H=51281, C=120}, 1])
-Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, 1], LSTMoutput3.Whfdh[1024, 1])
-Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[0 {W=472446402560, H=61793, C=120}, 0])
-Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, 1], LSTMoutput3.Wcfdc[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256 {W=472446402560, H=51281, C=120}, 1])
-Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256 {W=472446402560, H=51281, C=120}, 1])
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, 1], LSTMoutput3.Whidh[1024, 1])
-Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, 1], LSTMoutput3.Wcidc[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256 {W=472446402560, H=51281, C=120}, 1])
-Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256 {W=472446402560, H=51281, C=120}, 1])
-Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 1], LSTMoutput3.bc[1024, 1])
-Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, 1], LSTMoutput3.unnamed261[1024, 1])
-Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, 1])
-Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.unnamed259[1024, 1])
-Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.bit[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, 1], LSTMoutput3.Wcoct[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.unnamed274[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, 1])
-Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, 1])
-Validating --> b = LearnableParameter
-Validating --> LSTMoutputW = Plus(unnamed283[132, 1], b[132, 1])
-Validating --> cr = CrossEntropyWithSoftmax(labels[132, 1], LSTMoutputW[132, 1])
-
- nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
-
-Validating node ScaledLogLikelihood 
-
-Validating --> W = LearnableParameter
-Validating --> sW = LearnableParameter
-Validating --> expsW = Exp(sW[1, 1])
-Validating --> LSTMoutput3.Wmr = LearnableParameter
-Validating --> LSTMoutput3.sWmr = LearnableParameter
-Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1])
-Validating --> LSTMoutput3.Wxo = LearnableParameter
-Validating --> LSTMoutput3.sWxo = LearnableParameter
-Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1])
-Validating --> LSTMoutput2.Wmr = LearnableParameter
-Validating --> LSTMoutput2.sWmr = LearnableParameter
-Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1])
-Validating --> LSTMoutput2.Wxo = LearnableParameter
-Validating --> LSTMoutput2.sWxo = LearnableParameter
-Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1])
-Validating --> LSTMoutput1.Wmr = LearnableParameter
-Validating --> LSTMoutput1.sWmr = LearnableParameter
-Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1])
-Validating --> LSTMoutput1.Wxo = LearnableParameter
-Validating --> LSTMoutput1.sWxo = LearnableParameter
-Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1])
-Validating --> features = InputValue
-Validating --> feashift = RowSlice(features[363, 1])
-Validating --> featNorm.xMean = Mean(feashift[33, 1])
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 1])
-Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1])
-Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, 1])
-Validating --> LSTMoutput1.bo = LearnableParameter
-Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, 1], LSTMoutput1.bo[1024, 1])
-Validating --> LSTMoutput1.Who = LearnableParameter
-Validating --> LSTMoutput1.sWho = LearnableParameter
-Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1])
-Validating --> LSTMoutput1.Wco = LearnableParameter
-Validating --> LSTMoutput1.sWco = LearnableParameter
-Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1])
-Validating --> LSTMoutput1.Wxf = LearnableParameter
-Validating --> LSTMoutput1.sWxf = LearnableParameter
-Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1])
-Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, 1])
-Validating --> LSTMoutput1.bf = LearnableParameter
-Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, 1], LSTMoutput1.bf[1024, 1])
-Validating --> LSTMoutput1.Whf = LearnableParameter
-Validating --> LSTMoutput1.sWhf = LearnableParameter
-Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1])
-Validating --> LSTMoutput1.Wcf = LearnableParameter
-Validating --> LSTMoutput1.sWcf = LearnableParameter
-Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1])
-Validating --> LSTMoutput1.Wxi = LearnableParameter
-Validating --> LSTMoutput1.sWxi = LearnableParameter
-Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1])
-Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, 1])
-Validating --> LSTMoutput1.bi = LearnableParameter
-Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, 1], LSTMoutput1.bi[1024, 1])
-Validating --> LSTMoutput1.Whi = LearnableParameter
-Validating --> LSTMoutput1.sWhi = LearnableParameter
-Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1])
-Validating --> LSTMoutput1.Wci = LearnableParameter
-Validating --> LSTMoutput1.sWci = LearnableParameter
-Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1])
-Validating --> LSTMoutput1.Wxc = LearnableParameter
-Validating --> LSTMoutput1.sWxc = LearnableParameter
-Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1])
-Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, 1])
-Validating --> LSTMoutput1.Whc = LearnableParameter
-Validating --> LSTMoutput1.sWhc = LearnableParameter
-Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1])
-Validating --> LSTMoutput1.bc = LearnableParameter
-Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, 1])
-Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, 1], LSTMoutput1.Whodh[1024, 1])
-Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, 1])
-Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, 1], LSTMoutput1.Whfdh[1024, 1])
-Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, 1], LSTMoutput1.Wcfdc[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, 1])
-Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, 1], LSTMoutput1.Whidh[1024, 1])
-Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, 1], LSTMoutput1.Wcidc[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, 1])
-Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 1], LSTMoutput1.bc[1024, 1])
-Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, 1], LSTMoutput1.unnamed161[1024, 1])
-Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, 1])
-Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.unnamed159[1024, 1])
-Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.bit[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, 1], LSTMoutput1.Wcoct[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.unnamed174[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, 1])
-Validating --> LSTMoutput2.bo = LearnableParameter
-Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, 1], LSTMoutput2.bo[1024, 1])
-Validating --> LSTMoutput2.Who = LearnableParameter
-Validating --> LSTMoutput2.sWho = LearnableParameter
-Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1])
-Validating --> LSTMoutput2.Wco = LearnableParameter
-Validating --> LSTMoutput2.sWco = LearnableParameter
-Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1])
-Validating --> LSTMoutput2.Wxf = LearnableParameter
-Validating --> LSTMoutput2.sWxf = LearnableParameter
-Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1])
-Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, 1])
-Validating --> LSTMoutput2.bf = LearnableParameter
-Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, 1], LSTMoutput2.bf[1024, 1])
-Validating --> LSTMoutput2.Whf = LearnableParameter
-Validating --> LSTMoutput2.sWhf = LearnableParameter
-Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1])
-Validating --> LSTMoutput2.Wcf = LearnableParameter
-Validating --> LSTMoutput2.sWcf = LearnableParameter
-Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1])
-Validating --> LSTMoutput2.Wxi = LearnableParameter
-Validating --> LSTMoutput2.sWxi = LearnableParameter
-Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1])
-Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, 1])
-Validating --> LSTMoutput2.bi = LearnableParameter
-Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, 1], LSTMoutput2.bi[1024, 1])
-Validating --> LSTMoutput2.Whi = LearnableParameter
-Validating --> LSTMoutput2.sWhi = LearnableParameter
-Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1])
-Validating --> LSTMoutput2.Wci = LearnableParameter
-Validating --> LSTMoutput2.sWci = LearnableParameter
-Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1])
-Validating --> LSTMoutput2.Wxc = LearnableParameter
-Validating --> LSTMoutput2.sWxc = LearnableParameter
-Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1])
-Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, 1])
-Validating --> LSTMoutput2.Whc = LearnableParameter
-Validating --> LSTMoutput2.sWhc = LearnableParameter
-Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1])
-Validating --> LSTMoutput2.bc = LearnableParameter
-Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, 1])
-Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, 1], LSTMoutput2.Whodh[1024, 1])
-Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, 1])
-Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, 1], LSTMoutput2.Whfdh[1024, 1])
-Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, 1], LSTMoutput2.Wcfdc[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, 1])
-Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, 1], LSTMoutput2.Whidh[1024, 1])
-Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, 1], LSTMoutput2.Wcidc[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, 1])
-Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, 1], LSTMoutput2.bc[1024, 1])
-Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, 1], LSTMoutput2.unnamed211[1024, 1])
-Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, 1])
-Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.unnamed209[1024, 1])
-Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.bit[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, 1], LSTMoutput2.Wcoct[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.unnamed224[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, 1])
-Validating --> LSTMoutput3.bo = LearnableParameter
-Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, 1], LSTMoutput3.bo[1024, 1])
-Validating --> LSTMoutput3.Who = LearnableParameter
-Validating --> LSTMoutput3.sWho = LearnableParameter
-Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1])
-Validating --> LSTMoutput3.Wco = LearnableParameter
-Validating --> LSTMoutput3.sWco = LearnableParameter
-Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1])
-Validating --> LSTMoutput3.Wxf = LearnableParameter
-Validating --> LSTMoutput3.sWxf = LearnableParameter
-Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1])
-Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, 1])
-Validating --> LSTMoutput3.bf = LearnableParameter
-Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, 1], LSTMoutput3.bf[1024, 1])
-Validating --> LSTMoutput3.Whf = LearnableParameter
-Validating --> LSTMoutput3.sWhf = LearnableParameter
-Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1])
-Validating --> LSTMoutput3.Wcf = LearnableParameter
-Validating --> LSTMoutput3.sWcf = LearnableParameter
-Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1])
-Validating --> LSTMoutput3.Wxi = LearnableParameter
-Validating --> LSTMoutput3.sWxi = LearnableParameter
-Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1])
-Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, 1])
-Validating --> LSTMoutput3.bi = LearnableParameter
-Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, 1], LSTMoutput3.bi[1024, 1])
-Validating --> LSTMoutput3.Whi = LearnableParameter
-Validating --> LSTMoutput3.sWhi = LearnableParameter
-Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1])
-Validating --> LSTMoutput3.Wci = LearnableParameter
-Validating --> LSTMoutput3.sWci = LearnableParameter
-Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1])
-Validating --> LSTMoutput3.Wxc = LearnableParameter
-Validating --> LSTMoutput3.sWxc = LearnableParameter
-Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1])
-Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, 1])
-Validating --> LSTMoutput3.Whc = LearnableParameter
-Validating --> LSTMoutput3.sWhc = LearnableParameter
-Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1])
-Validating --> LSTMoutput3.bc = LearnableParameter
-Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, 1])
-Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, 1])
-Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, 1], LSTMoutput3.Whodh[1024, 1])
-Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, 1])
-Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, 1], LSTMoutput3.Whfdh[1024, 1])
-Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, 1], LSTMoutput3.Wcfdc[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, 1])
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, 1], LSTMoutput3.Whidh[1024, 1])
-Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, 1], LSTMoutput3.Wcidc[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, 1])
-Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 1], LSTMoutput3.bc[1024, 1])
-Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, 1], LSTMoutput3.unnamed261[1024, 1])
-Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, 1])
-Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.unnamed259[1024, 1])
-Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.bit[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, 1], LSTMoutput3.Wcoct[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.unnamed274[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, 1])
-Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, 1])
-Validating --> b = LearnableParameter
-Validating --> LSTMoutputW = Plus(unnamed283[132, 1], b[132, 1])
-Validating --> labels = InputValue
-Validating --> logPrior.Prior = Mean(labels[132, 1])
-Validating --> logPrior.LogPrior = Log(logPrior.Prior[132, 1])
-Validating --> ScaledLogLikelihood = Minus(LSTMoutputW[132, 1], logPrior.LogPrior[132, 1])
-
- nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
-
-Validating node Err 
-
-Validating --> labels = InputValue
-Validating --> W = LearnableParameter
-Validating --> sW = LearnableParameter
-Validating --> expsW = Exp(sW[1, 1])
-Validating --> LSTMoutput3.Wmr = LearnableParameter
-Validating --> LSTMoutput3.sWmr = LearnableParameter
-Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1])
-Validating --> LSTMoutput3.Wxo = LearnableParameter
-Validating --> LSTMoutput3.sWxo = LearnableParameter
-Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1])
-Validating --> LSTMoutput2.Wmr = LearnableParameter
-Validating --> LSTMoutput2.sWmr = LearnableParameter
-Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1])
-Validating --> LSTMoutput2.Wxo = LearnableParameter
-Validating --> LSTMoutput2.sWxo = LearnableParameter
-Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1])
-Validating --> LSTMoutput1.Wmr = LearnableParameter
-Validating --> LSTMoutput1.sWmr = LearnableParameter
-Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1])
-Validating --> LSTMoutput1.Wxo = LearnableParameter
-Validating --> LSTMoutput1.sWxo = LearnableParameter
-Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1])
-Validating --> features = InputValue
-Validating --> feashift = RowSlice(features[363, 1])
-Validating --> featNorm.xMean = Mean(feashift[33, 1])
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 1])
-Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1])
-Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, 1])
-Validating --> LSTMoutput1.bo = LearnableParameter
-Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, 1], LSTMoutput1.bo[1024, 1])
-Validating --> LSTMoutput1.Who = LearnableParameter
-Validating --> LSTMoutput1.sWho = LearnableParameter
-Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1])
-Validating --> LSTMoutput1.Wco = LearnableParameter
-Validating --> LSTMoutput1.sWco = LearnableParameter
-Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1])
-Validating --> LSTMoutput1.Wxf = LearnableParameter
-Validating --> LSTMoutput1.sWxf = LearnableParameter
-Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1])
-Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, 1])
-Validating --> LSTMoutput1.bf = LearnableParameter
-Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, 1], LSTMoutput1.bf[1024, 1])
-Validating --> LSTMoutput1.Whf = LearnableParameter
-Validating --> LSTMoutput1.sWhf = LearnableParameter
-Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1])
-Validating --> LSTMoutput1.Wcf = LearnableParameter
-Validating --> LSTMoutput1.sWcf = LearnableParameter
-Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1])
-Validating --> LSTMoutput1.Wxi = LearnableParameter
-Validating --> LSTMoutput1.sWxi = LearnableParameter
-Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1])
-Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, 1])
-Validating --> LSTMoutput1.bi = LearnableParameter
-Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, 1], LSTMoutput1.bi[1024, 1])
-Validating --> LSTMoutput1.Whi = LearnableParameter
-Validating --> LSTMoutput1.sWhi = LearnableParameter
-Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1])
-Validating --> LSTMoutput1.Wci = LearnableParameter
-Validating --> LSTMoutput1.sWci = LearnableParameter
-Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1])
-Validating --> LSTMoutput1.Wxc = LearnableParameter
-Validating --> LSTMoutput1.sWxc = LearnableParameter
-Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1])
-Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, 1])
-Validating --> LSTMoutput1.Whc = LearnableParameter
-Validating --> LSTMoutput1.sWhc = LearnableParameter
-Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1])
-Validating --> LSTMoutput1.bc = LearnableParameter
-Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, 1])
-Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, 1], LSTMoutput1.Whodh[1024, 1])
-Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, 1])
-Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, 1], LSTMoutput1.Whfdh[1024, 1])
-Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, 1], LSTMoutput1.Wcfdc[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, 1])
-Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, 1], LSTMoutput1.Whidh[1024, 1])
-Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, 1], LSTMoutput1.Wcidc[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, 1])
-Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 1], LSTMoutput1.bc[1024, 1])
-Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, 1], LSTMoutput1.unnamed161[1024, 1])
-Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, 1])
-Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.unnamed159[1024, 1])
-Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.bit[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, 1], LSTMoutput1.Wcoct[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.unnamed174[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, 1])
-Validating --> LSTMoutput2.bo = LearnableParameter
-Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, 1], LSTMoutput2.bo[1024, 1])
-Validating --> LSTMoutput2.Who = LearnableParameter
-Validating --> LSTMoutput2.sWho = LearnableParameter
-Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1])
-Validating --> LSTMoutput2.Wco = LearnableParameter
-Validating --> LSTMoutput2.sWco = LearnableParameter
-Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1])
-Validating --> LSTMoutput2.Wxf = LearnableParameter
-Validating --> LSTMoutput2.sWxf = LearnableParameter
-Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1])
-Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, 1])
-Validating --> LSTMoutput2.bf = LearnableParameter
-Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, 1], LSTMoutput2.bf[1024, 1])
-Validating --> LSTMoutput2.Whf = LearnableParameter
-Validating --> LSTMoutput2.sWhf = LearnableParameter
-Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1])
-Validating --> LSTMoutput2.Wcf = LearnableParameter
-Validating --> LSTMoutput2.sWcf = LearnableParameter
-Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1])
-Validating --> LSTMoutput2.Wxi = LearnableParameter
-Validating --> LSTMoutput2.sWxi = LearnableParameter
-Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1])
-Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, 1])
-Validating --> LSTMoutput2.bi = LearnableParameter
-Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, 1], LSTMoutput2.bi[1024, 1])
-Validating --> LSTMoutput2.Whi = LearnableParameter
-Validating --> LSTMoutput2.sWhi = LearnableParameter
-Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1])
-Validating --> LSTMoutput2.Wci = LearnableParameter
-Validating --> LSTMoutput2.sWci = LearnableParameter
-Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1])
-Validating --> LSTMoutput2.Wxc = LearnableParameter
-Validating --> LSTMoutput2.sWxc = LearnableParameter
-Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1])
-Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, 1])
-Validating --> LSTMoutput2.Whc = LearnableParameter
-Validating --> LSTMoutput2.sWhc = LearnableParameter
-Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1])
-Validating --> LSTMoutput2.bc = LearnableParameter
-Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, 1])
-Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, 1], LSTMoutput2.Whodh[1024, 1])
-Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, 1])
-Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, 1], LSTMoutput2.Whfdh[1024, 1])
-Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, 1], LSTMoutput2.Wcfdc[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, 1])
-Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, 1], LSTMoutput2.Whidh[1024, 1])
-Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, 1], LSTMoutput2.Wcidc[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, 1])
-Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, 1], LSTMoutput2.bc[1024, 1])
-Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, 1], LSTMoutput2.unnamed211[1024, 1])
-Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, 1])
-Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.unnamed209[1024, 1])
-Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.bit[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, 1], LSTMoutput2.Wcoct[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.unnamed224[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, 1])
-Validating --> LSTMoutput3.bo = LearnableParameter
-Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, 1], LSTMoutput3.bo[1024, 1])
-Validating --> LSTMoutput3.Who = LearnableParameter
-Validating --> LSTMoutput3.sWho = LearnableParameter
-Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1])
-Validating --> LSTMoutput3.Wco = LearnableParameter
-Validating --> LSTMoutput3.sWco = LearnableParameter
-Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1])
-Validating --> LSTMoutput3.Wxf = LearnableParameter
-Validating --> LSTMoutput3.sWxf = LearnableParameter
-Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1])
-Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, 1])
-Validating --> LSTMoutput3.bf = LearnableParameter
-Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, 1], LSTMoutput3.bf[1024, 1])
-Validating --> LSTMoutput3.Whf = LearnableParameter
-Validating --> LSTMoutput3.sWhf = LearnableParameter
-Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1])
-Validating --> LSTMoutput3.Wcf = LearnableParameter
-Validating --> LSTMoutput3.sWcf = LearnableParameter
-Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1])
-Validating --> LSTMoutput3.Wxi = LearnableParameter
-Validating --> LSTMoutput3.sWxi = LearnableParameter
-Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1])
-Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, 1])
-Validating --> LSTMoutput3.bi = LearnableParameter
-Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, 1], LSTMoutput3.bi[1024, 1])
-Validating --> LSTMoutput3.Whi = LearnableParameter
-Validating --> LSTMoutput3.sWhi = LearnableParameter
-Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1])
-Validating --> LSTMoutput3.Wci = LearnableParameter
-Validating --> LSTMoutput3.sWci = LearnableParameter
-Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1])
-Validating --> LSTMoutput3.Wxc = LearnableParameter
-Validating --> LSTMoutput3.sWxc = LearnableParameter
-Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1])
-Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, 1])
-Validating --> LSTMoutput3.Whc = LearnableParameter
-Validating --> LSTMoutput3.sWhc = LearnableParameter
-Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1])
-Validating --> LSTMoutput3.bc = LearnableParameter
-Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, 1])
-Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, 1])
-Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, 1], LSTMoutput3.Whodh[1024, 1])
-Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, 1])
-Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, 1], LSTMoutput3.Whfdh[1024, 1])
-Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, 1], LSTMoutput3.Wcfdc[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, 1])
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, 1], LSTMoutput3.Whidh[1024, 1])
-Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, 1], LSTMoutput3.Wcidc[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, 1])
-Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 1], LSTMoutput3.bc[1024, 1])
-Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, 1], LSTMoutput3.unnamed261[1024, 1])
-Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, 1])
-Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.unnamed259[1024, 1])
-Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.bit[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, 1], LSTMoutput3.Wcoct[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.unnamed274[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, 1])
-Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, 1])
-Validating --> b = LearnableParameter
-Validating --> LSTMoutputW = Plus(unnamed283[132, 1], b[132, 1])
-Validating --> Err = ErrorPrediction(labels[132, 1], LSTMoutputW[132, 1])
-
+Node --> B = LearnableParameter
+Node --> labels = InputValue
+Node --> LSTMoutputW./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].Wmr = LearnableParameter
+Node --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].Wmr = LearnableParameter
+Node --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].Wmr = LearnableParameter
+Node --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> features = InputValue
+Node --> feashift = RowSlice
+Node --> featNorm.meanVector = Mean
+Node --> featNorm.invStdDevVector = InvStdDev
+Node --> featNorm = PerDimMeanVarNormalization
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].dh = PastValue
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[1].ot.z./*+*/left = Plus
+Node --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[1].ft.z./*+*/left = Plus
+Node --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].dc = PastValue
+Node --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[1].ft.z./*+*/right = DiagTimes
+Node --> LSTMoutput[1].ft.z = Plus
+Node --> LSTMoutput[1].ft = Sigmoid
+Node --> LSTMoutput[1].bft = ElementTimes
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[1].it.z./*+*/left = Plus
+Node --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].it.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[1].it.z./*+*/right = DiagTimes
+Node --> LSTMoutput[1].it.z = Plus
+Node --> LSTMoutput[1].it = Sigmoid
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus
+Node --> LSTMoutput[1].bit./*.**/right.z = Plus
+Node --> LSTMoutput[1].bit./*.**/right = Tanh
+Node --> LSTMoutput[1].bit = ElementTimes
+Node --> LSTMoutput[1].ct = Plus
+Node --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[1].ot.z./*+*/right = DiagTimes
+Node --> LSTMoutput[1].ot.z = Plus
+Node --> LSTMoutput[1].ot = Sigmoid
+Node --> LSTMoutput[1].mt./*.**/right = Tanh
+Node --> LSTMoutput[1].mt = ElementTimes
+Node --> LSTMoutput[1].output./***/right = Scale
+Node --> LSTMoutput[1].output = Times
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].dh = PastValue
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[2].ot.z./*+*/left = Plus
+Node --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[2].ft.z./*+*/left = Plus
+Node --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].dc = PastValue
+Node --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[2].ft.z./*+*/right = DiagTimes
+Node --> LSTMoutput[2].ft.z = Plus
+Node --> LSTMoutput[2].ft = Sigmoid
+Node --> LSTMoutput[2].bft = ElementTimes
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[2].it.z./*+*/left = Plus
+Node --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].it.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[2].it.z./*+*/right = DiagTimes
+Node --> LSTMoutput[2].it.z = Plus
+Node --> LSTMoutput[2].it = Sigmoid
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus
+Node --> LSTMoutput[2].bit./*.**/right.z = Plus
+Node --> LSTMoutput[2].bit./*.**/right = Tanh
+Node --> LSTMoutput[2].bit = ElementTimes
+Node --> LSTMoutput[2].ct = Plus
+Node --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[2].ot.z./*+*/right = DiagTimes
+Node --> LSTMoutput[2].ot.z = Plus
+Node --> LSTMoutput[2].ot = Sigmoid
+Node --> LSTMoutput[2].mt./*.**/right = Tanh
+Node --> LSTMoutput[2].mt = ElementTimes
+Node --> LSTMoutput[2].output./***/right = Scale
+Node --> LSTMoutput[2].output = Times
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].dh = PastValue
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[3].ot.z./*+*/left = Plus
+Node --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[3].ft.z./*+*/left = Plus
+Node --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].dc = PastValue
+Node --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[3].ft.z./*+*/right = DiagTimes
+Node --> LSTMoutput[3].ft.z = Plus
+Node --> LSTMoutput[3].ft = Sigmoid
+Node --> LSTMoutput[3].bft = ElementTimes
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[3].it.z./*+*/left = Plus
+Node --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].it.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[3].it.z./*+*/right = DiagTimes
+Node --> LSTMoutput[3].it.z = Plus
+Node --> LSTMoutput[3].it = Sigmoid
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus
+Node --> LSTMoutput[3].bit./*.**/right.z = Plus
+Node --> LSTMoutput[3].bit./*.**/right = Tanh
+Node --> LSTMoutput[3].bit = ElementTimes
+Node --> LSTMoutput[3].ct = Plus
+Node --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[3].ot.z./*+*/right = DiagTimes
+Node --> LSTMoutput[3].ot.z = Plus
+Node --> LSTMoutput[3].ot = Sigmoid
+Node --> LSTMoutput[3].mt./*.**/right = Tanh
+Node --> LSTMoutput[3].mt = ElementTimes
+Node --> LSTMoutput[3].output./***/right = Scale
+Node --> LSTMoutput[3].output = Times
+Node --> LSTMoutputW./*+*/left./***/right = Scale
+Node --> LSTMoutputW./*+*/left = Times
+Node --> LSTMoutputW = Plus
+Node --> Err = ErrorPrediction
+Node --> logPrior.x = Mean
+Node --> logPrior = Log
+Node --> ScaledLogLikelihood = Minus
+Node --> cr = CrossEntropyWithSoftmax
+N9Microsoft3MSR4CNTK18ComputationNetworkE [
+  B : LearnableParameter 132 x 1 ()
+  cr : CrossEntropyWithSoftmax 0 x 0 (
+    labels
+    LSTMoutputW
+  )
+  Err : ErrorPrediction 0 x 0 (
+    labels
+    LSTMoutputW
+  )
+  feashift : RowSlice 0 x 0 (
+    features
+  )
+  featNorm : PerDimMeanVarNormalization 0 x 0 (
+    feashift
+    featNorm.meanVector
+    featNorm.invStdDevVector
+  )
+  featNorm.invStdDevVector : InvStdDev 0 x 0 (
+    feashift
+  )
+  featNorm.meanVector : Mean 0 x 0 (
+    feashift
+  )
+  features : InputValue 363 x 1 ()
+  labels : InputValue 132 x 1 ()
+  logPrior : Log 0 x 0 (
+    logPrior.x
+  )
+  logPrior.x : Mean 0 x 0 (
+    labels
+  )
+  LSTMoutput[1].bft : ElementTimes 0 x 0 (
+    LSTMoutput[1].ft
+    LSTMoutput[1].dc
+  )
+  LSTMoutput[1].bit : ElementTimes 0 x 0 (
+    LSTMoutput[1].it
+    LSTMoutput[1].bit./*.**/right
+  )
+  LSTMoutput[1].bit./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z
+  )
+  LSTMoutput[1].bit./*.**/right.z : Plus 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/left
+    LSTMoutput[1].bit./*.**/right.z./*+*/right
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/left : Times 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left
+    LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 33 ()
+  LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor
+    featNorm
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].bit./*.**/right.z./*+*/right : Plus 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[1].dh
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].ct : Plus 0 x 0 (
+    LSTMoutput[1].bft
+    LSTMoutput[1].bit
+  )
+  LSTMoutput[1].dc : PastValue 1024 x 1 (
+    LSTMoutput[1].ct
+  )
+  LSTMoutput[1].dh : PastValue 256 x 1 (
+    LSTMoutput[1].output
+  )
+  LSTMoutput[1].ft : Sigmoid 0 x 0 (
+    LSTMoutput[1].ft.z
+  )
+  LSTMoutput[1].ft.z : Plus 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left
+    LSTMoutput[1].ft.z./*+*/right
+  )
+  LSTMoutput[1].ft.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/left
+    LSTMoutput[1].ft.z./*+*/left./*+*/right
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 ()
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    featNorm
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].ft.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left
+    LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[1].dh
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].ft.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[1].ft.z./*+*/right.matrix
+  )
+  LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].ft.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[1].dc
+  )
+  LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].it : Sigmoid 0 x 0 (
+    LSTMoutput[1].it.z
+  )
+  LSTMoutput[1].it.z : Plus 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left
+    LSTMoutput[1].it.z./*+*/right
+  )
+  LSTMoutput[1].it.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/left
+    LSTMoutput[1].it.z./*+*/left./*+*/right
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 ()
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    featNorm
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].it.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/right./***/left
+    LSTMoutput[1].it.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[1].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[1].dh
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].it.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[1].it.z./*+*/right.matrix
+  )
+  LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].it.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[1].dc
+  )
+  LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].mt : ElementTimes 0 x 0 (
+    LSTMoutput[1].ot
+    LSTMoutput[1].mt./*.**/right
+  )
+  LSTMoutput[1].mt./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[1].ct
+  )
+  LSTMoutput[1].ot : Sigmoid 0 x 0 (
+    LSTMoutput[1].ot.z
+  )
+  LSTMoutput[1].ot.z : Plus 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left
+    LSTMoutput[1].ot.z./*+*/right
+  )
+  LSTMoutput[1].ot.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/left
+    LSTMoutput[1].ot.z./*+*/left./*+*/right
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 ()
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    featNorm
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].ot.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left
+    LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[1].dh
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].ot.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[1].ot.z./*+*/right.matrix
+  )
+  LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].ot.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[1].ct
+  )
+  LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].output : Times 0 x 0 (
+    LSTMoutput[1].Wmr
+    LSTMoutput[1].output./***/right
+  )
+  LSTMoutput[1].output./***/right : Scale 0 x 0 (
+    LSTMoutput[1].output./***/right.scalarScalingFactor
+    LSTMoutput[1].mt
+  )
+  LSTMoutput[1].output./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].output./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].Wmr : LearnableParameter 256 x 1024 ()
+  LSTMoutput[2].bft : ElementTimes 0 x 0 (
+    LSTMoutput[2].ft
+    LSTMoutput[2].dc
+  )
+  LSTMoutput[2].bit : ElementTimes 0 x 0 (
+    LSTMoutput[2].it
+    LSTMoutput[2].bit./*.**/right
+  )
+  LSTMoutput[2].bit./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z
+  )
+  LSTMoutput[2].bit./*.**/right.z : Plus 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/left
+    LSTMoutput[2].bit./*.**/right.z./*+*/right
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/left : Times 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left
+    LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[1].output
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].bit./*.**/right.z./*+*/right : Plus 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[2].dh
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].ct : Plus 0 x 0 (
+    LSTMoutput[2].bft
+    LSTMoutput[2].bit
+  )
+  LSTMoutput[2].dc : PastValue 1024 x 1 (
+    LSTMoutput[2].ct
+  )
+  LSTMoutput[2].dh : PastValue 256 x 1 (
+    LSTMoutput[2].output
+  )
+  LSTMoutput[2].ft : Sigmoid 0 x 0 (
+    LSTMoutput[2].ft.z
+  )
+  LSTMoutput[2].ft.z : Plus 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left
+    LSTMoutput[2].ft.z./*+*/right
+  )
+  LSTMoutput[2].ft.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/left
+    LSTMoutput[2].ft.z./*+*/left./*+*/right
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[1].output
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].ft.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left
+    LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[2].dh
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].ft.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[2].ft.z./*+*/right.matrix
+  )
+  LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].ft.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[2].dc
+  )
+  LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].it : Sigmoid 0 x 0 (
+    LSTMoutput[2].it.z
+  )
+  LSTMoutput[2].it.z : Plus 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left
+    LSTMoutput[2].it.z./*+*/right
+  )
+  LSTMoutput[2].it.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/left
+    LSTMoutput[2].it.z./*+*/left./*+*/right
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[1].output
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].it.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/right./***/left
+    LSTMoutput[2].it.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[2].dh
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].it.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[2].it.z./*+*/right.matrix
+  )
+  LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].it.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[2].dc
+  )
+  LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].mt : ElementTimes 0 x 0 (
+    LSTMoutput[2].ot
+    LSTMoutput[2].mt./*.**/right
+  )
+  LSTMoutput[2].mt./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[2].ct
+  )
+  LSTMoutput[2].ot : Sigmoid 0 x 0 (
+    LSTMoutput[2].ot.z
+  )
+  LSTMoutput[2].ot.z : Plus 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left
+    LSTMoutput[2].ot.z./*+*/right
+  )
+  LSTMoutput[2].ot.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/left
+    LSTMoutput[2].ot.z./*+*/left./*+*/right
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[1].output
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].ot.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left
+    LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[2].dh
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].ot.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[2].ot.z./*+*/right.matrix
+  )
+  LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].ot.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[2].ct
+  )
+  LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].output : Times 0 x 0 (
+    LSTMoutput[2].Wmr
+    LSTMoutput[2].output./***/right
+  )
+  LSTMoutput[2].output./***/right : Scale 0 x 0 (
+    LSTMoutput[2].output./***/right.scalarScalingFactor
+    LSTMoutput[2].mt
+  )
+  LSTMoutput[2].output./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].output./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].Wmr : LearnableParameter 256 x 1024 ()
+  LSTMoutput[3].bft : ElementTimes 0 x 0 (
+    LSTMoutput[3].ft
+    LSTMoutput[3].dc
+  )
+  LSTMoutput[3].bit : ElementTimes 0 x 0 (
+    LSTMoutput[3].it
+    LSTMoutput[3].bit./*.**/right
+  )
+  LSTMoutput[3].bit./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z
+  )
+  LSTMoutput[3].bit./*.**/right.z : Plus 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/left
+    LSTMoutput[3].bit./*.**/right.z./*+*/right
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/left : Times 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left
+    LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[2].output
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].bit./*.**/right.z./*+*/right : Plus 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[3].dh
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].ct : Plus 0 x 0 (
+    LSTMoutput[3].bft
+    LSTMoutput[3].bit
+  )
+  LSTMoutput[3].dc : PastValue 1024 x 1 (
+    LSTMoutput[3].ct
+  )
+  LSTMoutput[3].dh : PastValue 256 x 1 (
+    LSTMoutput[3].output
+  )
+  LSTMoutput[3].ft : Sigmoid 0 x 0 (
+    LSTMoutput[3].ft.z
+  )
+  LSTMoutput[3].ft.z : Plus 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left
+    LSTMoutput[3].ft.z./*+*/right
+  )
+  LSTMoutput[3].ft.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/left
+    LSTMoutput[3].ft.z./*+*/left./*+*/right
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[2].output
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].ft.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left
+    LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[3].dh
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].ft.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[3].ft.z./*+*/right.matrix
+  )
+  LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].ft.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[3].dc
+  )
+  LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].it : Sigmoid 0 x 0 (
+    LSTMoutput[3].it.z
+  )
+  LSTMoutput[3].it.z : Plus 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left
+    LSTMoutput[3].it.z./*+*/right
+  )
+  LSTMoutput[3].it.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/left
+    LSTMoutput[3].it.z./*+*/left./*+*/right
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[2].output
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].it.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/right./***/left
+    LSTMoutput[3].it.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[3].dh
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].it.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[3].it.z./*+*/right.matrix
+  )
+  LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].it.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[3].dc
+  )
+  LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].mt : ElementTimes 0 x 0 (
+    LSTMoutput[3].ot
+    LSTMoutput[3].mt./*.**/right
+  )
+  LSTMoutput[3].mt./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[3].ct
+  )
+  LSTMoutput[3].ot : Sigmoid 0 x 0 (
+    LSTMoutput[3].ot.z
+  )
+  LSTMoutput[3].ot.z : Plus 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left
+    LSTMoutput[3].ot.z./*+*/right
+  )
+  LSTMoutput[3].ot.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/left
+    LSTMoutput[3].ot.z./*+*/left./*+*/right
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[2].output
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].ot.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left
+    LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[3].dh
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].ot.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[3].ot.z./*+*/right.matrix
+  )
+  LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].ot.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[3].ct
+  )
+  LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].output : Times 0 x 0 (
+    LSTMoutput[3].Wmr
+    LSTMoutput[3].output./***/right
+  )
+  LSTMoutput[3].output./***/right : Scale 0 x 0 (
+    LSTMoutput[3].output./***/right.scalarScalingFactor
+    LSTMoutput[3].mt
+  )
+  LSTMoutput[3].output./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].output./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].Wmr : LearnableParameter 256 x 1024 ()
+  LSTMoutputW : Plus 0 x 0 (
+    LSTMoutputW./*+*/left
+    B
+  )
+  LSTMoutputW./*+*/left : Times 0 x 0 (
+    LSTMoutputW./*+*/left./***/left
+    LSTMoutputW./*+*/left./***/right
+  )
+  LSTMoutputW./*+*/left./***/left : LearnableParameter 132 x 256 ()
+  LSTMoutputW./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutputW./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[3].output
+  )
+  LSTMoutputW./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  ScaledLogLikelihood : Minus 0 x 0 (
+    LSTMoutputW
+    logPrior
+  )
+]
 GetTrainCriterionNodes  ...
 GetEvalCriterionNodes  ...
  nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
+LSTMoutput[1].mt./*.**/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].mt./*.**/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].mt./*.**/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	 nodes in the recurrent loops : 
+LSTMoutput[1].mt./*.**/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].mt./*.**/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].mt./*.**/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	
 
-Validating node cr 
+Validating for node cr. 272 nodes to process in pass 1.
 
-Validating --> labels = InputValue
-Validating --> W = LearnableParameter
-Validating --> sW = LearnableParameter
-Validating --> expsW = Exp(sW[1, 1])
-Validating --> LSTMoutput3.Wmr = LearnableParameter
-Validating --> LSTMoutput3.sWmr = LearnableParameter
-Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1])
-Validating --> LSTMoutput3.Wxo = LearnableParameter
-Validating --> LSTMoutput3.sWxo = LearnableParameter
-Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1])
-Validating --> LSTMoutput2.Wmr = LearnableParameter
-Validating --> LSTMoutput2.sWmr = LearnableParameter
-Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1])
-Validating --> LSTMoutput2.Wxo = LearnableParameter
-Validating --> LSTMoutput2.sWxo = LearnableParameter
-Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1])
-Validating --> LSTMoutput1.Wmr = LearnableParameter
-Validating --> LSTMoutput1.sWmr = LearnableParameter
-Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1])
-Validating --> LSTMoutput1.Wxo = LearnableParameter
-Validating --> LSTMoutput1.sWxo = LearnableParameter
-Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1])
-Validating --> features = InputValue
-Validating --> feashift = RowSlice(features[363, 1])
-Validating --> featNorm.xMean = Mean(feashift[33, 1])
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 1])
-Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1])
-Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, 1])
-Validating --> LSTMoutput1.bo = LearnableParameter
-Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, 1], LSTMoutput1.bo[1024, 1])
-Validating --> LSTMoutput1.Who = LearnableParameter
-Validating --> LSTMoutput1.sWho = LearnableParameter
-Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1])
-Validating --> LSTMoutput1.Wco = LearnableParameter
-Validating --> LSTMoutput1.sWco = LearnableParameter
-Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1])
-Validating --> LSTMoutput1.Wxf = LearnableParameter
-Validating --> LSTMoutput1.sWxf = LearnableParameter
-Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1])
-Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, 1])
-Validating --> LSTMoutput1.bf = LearnableParameter
-Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, 1], LSTMoutput1.bf[1024, 1])
-Validating --> LSTMoutput1.Whf = LearnableParameter
-Validating --> LSTMoutput1.sWhf = LearnableParameter
-Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1])
-Validating --> LSTMoutput1.Wcf = LearnableParameter
-Validating --> LSTMoutput1.sWcf = LearnableParameter
-Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1])
-Validating --> LSTMoutput1.Wxi = LearnableParameter
-Validating --> LSTMoutput1.sWxi = LearnableParameter
-Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1])
-Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, 1])
-Validating --> LSTMoutput1.bi = LearnableParameter
-Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, 1], LSTMoutput1.bi[1024, 1])
-Validating --> LSTMoutput1.Whi = LearnableParameter
-Validating --> LSTMoutput1.sWhi = LearnableParameter
-Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1])
-Validating --> LSTMoutput1.Wci = LearnableParameter
-Validating --> LSTMoutput1.sWci = LearnableParameter
-Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1])
-Validating --> LSTMoutput1.Wxc = LearnableParameter
-Validating --> LSTMoutput1.sWxc = LearnableParameter
-Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1])
-Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, 1])
-Validating --> LSTMoutput1.Whc = LearnableParameter
-Validating --> LSTMoutput1.sWhc = LearnableParameter
-Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1])
-Validating --> LSTMoutput1.bc = LearnableParameter
-Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, 1])
-Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, 1], LSTMoutput1.Whodh[1024, 1])
-Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, 1])
-Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, 1], LSTMoutput1.Whfdh[1024, 1])
-Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, 1], LSTMoutput1.Wcfdc[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, 1])
-Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, 1], LSTMoutput1.Whidh[1024, 1])
-Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, 1], LSTMoutput1.Wcidc[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, 1])
-Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 1], LSTMoutput1.bc[1024, 1])
-Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, 1], LSTMoutput1.unnamed161[1024, 1])
-Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, 1])
-Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.unnamed159[1024, 1])
-Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.bit[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, 1], LSTMoutput1.Wcoct[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=472446402560, H=24545, C=120}, 1], LSTMoutput1.unnamed174[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=472446402560, H=24545, C=120}, 1])
-Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, 1])
-Validating --> LSTMoutput2.bo = LearnableParameter
-Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, 1], LSTMoutput2.bo[1024, 1])
-Validating --> LSTMoutput2.Who = LearnableParameter
-Validating --> LSTMoutput2.sWho = LearnableParameter
-Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1])
-Validating --> LSTMoutput2.Wco = LearnableParameter
-Validating --> LSTMoutput2.sWco = LearnableParameter
-Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1])
-Validating --> LSTMoutput2.Wxf = LearnableParameter
-Validating --> LSTMoutput2.sWxf = LearnableParameter
-Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1])
-Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, 1])
-Validating --> LSTMoutput2.bf = LearnableParameter
-Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, 1], LSTMoutput2.bf[1024, 1])
-Validating --> LSTMoutput2.Whf = LearnableParameter
-Validating --> LSTMoutput2.sWhf = LearnableParameter
-Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1])
-Validating --> LSTMoutput2.Wcf = LearnableParameter
-Validating --> LSTMoutput2.sWcf = LearnableParameter
-Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1])
-Validating --> LSTMoutput2.Wxi = LearnableParameter
-Validating --> LSTMoutput2.sWxi = LearnableParameter
-Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1])
-Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, 1])
-Validating --> LSTMoutput2.bi = LearnableParameter
-Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, 1], LSTMoutput2.bi[1024, 1])
-Validating --> LSTMoutput2.Whi = LearnableParameter
-Validating --> LSTMoutput2.sWhi = LearnableParameter
-Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1])
-Validating --> LSTMoutput2.Wci = LearnableParameter
-Validating --> LSTMoutput2.sWci = LearnableParameter
-Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1])
-Validating --> LSTMoutput2.Wxc = LearnableParameter
-Validating --> LSTMoutput2.sWxc = LearnableParameter
-Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1])
-Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, 1])
-Validating --> LSTMoutput2.Whc = LearnableParameter
-Validating --> LSTMoutput2.sWhc = LearnableParameter
-Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1])
-Validating --> LSTMoutput2.bc = LearnableParameter
-Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, 1])
-Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, 1], LSTMoutput2.Whodh[1024, 1])
-Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, 1])
-Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, 1], LSTMoutput2.Whfdh[1024, 1])
-Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, 1], LSTMoutput2.Wcfdc[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, 1])
-Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, 1], LSTMoutput2.Whidh[1024, 1])
-Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, 1], LSTMoutput2.Wcidc[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, 1])
-Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, 1], LSTMoutput2.bc[1024, 1])
-Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, 1], LSTMoutput2.unnamed211[1024, 1])
-Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, 1])
-Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.unnamed209[1024, 1])
-Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.bit[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, 1], LSTMoutput2.Wcoct[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=472446402560, H=42273, C=120}, 1], LSTMoutput2.unnamed224[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=472446402560, H=42273, C=120}, 1])
-Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, 1])
-Validating --> LSTMoutput3.bo = LearnableParameter
-Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, 1], LSTMoutput3.bo[1024, 1])
-Validating --> LSTMoutput3.Who = LearnableParameter
-Validating --> LSTMoutput3.sWho = LearnableParameter
-Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1])
-Validating --> LSTMoutput3.Wco = LearnableParameter
-Validating --> LSTMoutput3.sWco = LearnableParameter
-Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1])
-Validating --> LSTMoutput3.Wxf = LearnableParameter
-Validating --> LSTMoutput3.sWxf = LearnableParameter
-Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1])
-Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, 1])
-Validating --> LSTMoutput3.bf = LearnableParameter
-Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, 1], LSTMoutput3.bf[1024, 1])
-Validating --> LSTMoutput3.Whf = LearnableParameter
-Validating --> LSTMoutput3.sWhf = LearnableParameter
-Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1])
-Validating --> LSTMoutput3.Wcf = LearnableParameter
-Validating --> LSTMoutput3.sWcf = LearnableParameter
-Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1])
-Validating --> LSTMoutput3.Wxi = LearnableParameter
-Validating --> LSTMoutput3.sWxi = LearnableParameter
-Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1])
-Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, 1])
-Validating --> LSTMoutput3.bi = LearnableParameter
-Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, 1], LSTMoutput3.bi[1024, 1])
-Validating --> LSTMoutput3.Whi = LearnableParameter
-Validating --> LSTMoutput3.sWhi = LearnableParameter
-Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1])
-Validating --> LSTMoutput3.Wci = LearnableParameter
-Validating --> LSTMoutput3.sWci = LearnableParameter
-Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1])
-Validating --> LSTMoutput3.Wxc = LearnableParameter
-Validating --> LSTMoutput3.sWxc = LearnableParameter
-Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1])
-Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, 1])
-Validating --> LSTMoutput3.Whc = LearnableParameter
-Validating --> LSTMoutput3.sWhc = LearnableParameter
-Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1])
-Validating --> LSTMoutput3.bc = LearnableParameter
-Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, 1])
-Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, 1])
-Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, 1], LSTMoutput3.Whodh[1024, 1])
-Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, 1])
-Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, 1], LSTMoutput3.Whfdh[1024, 1])
-Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, 1], LSTMoutput3.Wcfdc[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, 1])
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, 1], LSTMoutput3.Whidh[1024, 1])
-Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, 1], LSTMoutput3.Wcidc[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, 1])
-Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 1], LSTMoutput3.bc[1024, 1])
-Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, 1], LSTMoutput3.unnamed261[1024, 1])
-Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, 1])
-Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.unnamed259[1024, 1])
-Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.bit[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, 1], LSTMoutput3.Wcoct[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=472446402560, H=61793, C=120}, 1], LSTMoutput3.unnamed274[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=472446402560, H=61793, C=120}, 1])
-Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, 1])
-Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, 1])
-Validating --> b = LearnableParameter
-Validating --> LSTMoutputW = Plus(unnamed283[132, 1], b[132, 1])
-Validating --> cr = CrossEntropyWithSoftmax(labels[132, 1], LSTMoutputW[132, 1])
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
 
-Found 6 PreCompute nodes
-	NodeName: featNorm.xMean
-	NodeName: featNorm.xStdDev
-	NodeName: logPrior.Prior
-	NodeName: featNorm.xMean
-	NodeName: featNorm.xStdDev
-	NodeName: logPrior.Prior
+Validating for node cr. 183 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
+
+Validating for node cr. 60 nodes to process in pass 3.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
+
+Validating for node cr, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
+
+127 out of 272 nodes do not share the minibatch layout with the input data.
+
+
+Precomputing --> 3 PreCompute nodes found.
+
+	NodeName: featNorm.invStdDevVector
+	NodeName: featNorm.meanVector
+	NodeName: logPrior.x
 minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
 requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
  nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	 nodes in the recurrent loops : 
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	
 
-Validating node featNorm.xMean 
+Validating for node featNorm.invStdDevVector. 3 nodes to process in pass 1.
 
-Validating --> features = InputValue
-Validating --> feashift = RowSlice(features[363, 640])
-Validating --> featNorm.xMean = Mean(feashift[33, 640])
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
+
+Validating for node featNorm.invStdDevVector, final verification.
+
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
+
+1 out of 3 nodes do not share the minibatch layout with the input data.
 
  nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	 nodes in the recurrent loops : 
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	
 
-Validating node featNorm.xStdDev 
+Validating for node featNorm.meanVector. 3 nodes to process in pass 1.
 
-Validating --> features = InputValue
-Validating --> feashift = RowSlice(features[363, 640])
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 640])
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
+
+Validating for node featNorm.meanVector, final verification.
+
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
+
+1 out of 3 nodes do not share the minibatch layout with the input data.
 
  nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	 nodes in the recurrent loops : 
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	
 
-Validating node logPrior.Prior 
+Validating for node logPrior.x. 2 nodes to process in pass 1.
 
-Validating --> labels = InputValue
-Validating --> logPrior.Prior = Mean(labels[132, 640])
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1]
+
+Validating for node logPrior.x. 1 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1]
+
+Validating for node logPrior.x, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1]
+
+1 out of 2 nodes do not share the minibatch layout with the input data.
+
+EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once.
+
+Precomputing --> Completed.
 
 Set Max Temp Mem Size For Convolution Nodes to 0 samples.
-Starting Epoch 1: learning rate per sample = 0.000781  momentum = 0.000000 
+Starting Epoch 1: learning rate per sample = 0.000781  effective momentum = 0.000000 
 minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
+ nodes in the recurrent loops : 
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	 nodes in the recurrent loops : 
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	
+
+Validating for node Err. 272 nodes to process in pass 1.
+
+Validating --> labels = InputValue -> [132, MBSize 640]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 640]
+Validating --> feashift = RowSlice(features[363, MBSize 640]) -> [33, MBSize 640]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 640]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 640]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 640], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 640], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 640], LSTMoutput[1].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 640], LSTMoutput[1].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 640], LSTMoutput[1].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 640], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 640], LSTMoutput[2].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 640], LSTMoutput[2].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 640], LSTMoutput[2].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 640], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 640], LSTMoutput[3].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 640], LSTMoutput[3].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 640], LSTMoutput[3].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 640]) -> [132, MBSize 640]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 640], B[132, 1]) -> [132, MBSize 640]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 640], LSTMoutputW[132, MBSize 640]) -> [1, 1]
+
+Validating for node Err. 180 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 640]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 640]
+Validating --> feashift = RowSlice(features[363, MBSize 640]) -> [33, MBSize 640]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 640]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 640]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 640], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 640], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 640], LSTMoutput[1].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 640], LSTMoutput[1].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 640], LSTMoutput[1].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 640], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 640], LSTMoutput[2].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 640], LSTMoutput[2].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 640], LSTMoutput[2].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 640], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 640], LSTMoutput[3].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 640], LSTMoutput[3].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 640], LSTMoutput[3].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 640]) -> [132, MBSize 640]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 640], B[132, 1]) -> [132, MBSize 640]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 640], LSTMoutputW[132, MBSize 640]) -> [1, 1]
+
+Validating for node Err. 6 nodes to process in pass 3.
+
+Validating --> labels = InputValue -> [132, MBSize 640]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 640]
+Validating --> feashift = RowSlice(features[363, MBSize 640]) -> [33, MBSize 640]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 640]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 640]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 640], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 640], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 640], LSTMoutput[1].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 640], LSTMoutput[1].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 640], LSTMoutput[1].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 640], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 640], LSTMoutput[2].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 640], LSTMoutput[2].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 640], LSTMoutput[2].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 640], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 640], LSTMoutput[3].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 640], LSTMoutput[3].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 640], LSTMoutput[3].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 640]) -> [132, MBSize 640]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 640], B[132, 1]) -> [132, MBSize 640]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 640], LSTMoutputW[132, MBSize 640]) -> [1, 1]
+
+Validating for node Err, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 640]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 640]
+Validating --> feashift = RowSlice(features[363, MBSize 640]) -> [33, MBSize 640]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 640]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 640]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 640], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 640], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 640], LSTMoutput[1].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 640], LSTMoutput[1].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 640], LSTMoutput[1].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 640], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 640], LSTMoutput[2].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 640], LSTMoutput[2].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 640], LSTMoutput[2].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 640], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 640], LSTMoutput[3].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 640], LSTMoutput[3].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 640], LSTMoutput[3].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 640]) -> [132, MBSize 640]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 640], B[132, 1]) -> [132, MBSize 640]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 640], LSTMoutputW[132, MBSize 640]) -> [1, 1]
+
+127 out of 272 nodes do not share the minibatch layout with the input data.
+
 
 Starting minibatch loop.
- nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
-
-Validating node Err 
-
-Validating --> labels = InputValue
-Validating --> W = LearnableParameter
-Validating --> sW = LearnableParameter
-Validating --> expsW = Exp(sW[1, 1])
-Validating --> LSTMoutput3.Wmr = LearnableParameter
-Validating --> LSTMoutput3.sWmr = LearnableParameter
-Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1])
-Validating --> LSTMoutput3.Wxo = LearnableParameter
-Validating --> LSTMoutput3.sWxo = LearnableParameter
-Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1])
-Validating --> LSTMoutput2.Wmr = LearnableParameter
-Validating --> LSTMoutput2.sWmr = LearnableParameter
-Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1])
-Validating --> LSTMoutput2.Wxo = LearnableParameter
-Validating --> LSTMoutput2.sWxo = LearnableParameter
-Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1])
-Validating --> LSTMoutput1.Wmr = LearnableParameter
-Validating --> LSTMoutput1.sWmr = LearnableParameter
-Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1])
-Validating --> LSTMoutput1.Wxo = LearnableParameter
-Validating --> LSTMoutput1.sWxo = LearnableParameter
-Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1])
-Validating --> features = InputValue
-Validating --> feashift = RowSlice(features[363, 640])
-Validating --> featNorm.xMean = Mean(feashift[33, 640])
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 640])
-Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, 640], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1])
-Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, 640])
-Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, 640])
-Validating --> LSTMoutput1.bo = LearnableParameter
-Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, 640], LSTMoutput1.bo[1024, 1])
-Validating --> LSTMoutput1.Who = LearnableParameter
-Validating --> LSTMoutput1.sWho = LearnableParameter
-Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1])
-Validating --> LSTMoutput1.Wco = LearnableParameter
-Validating --> LSTMoutput1.sWco = LearnableParameter
-Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1])
-Validating --> LSTMoutput1.Wxf = LearnableParameter
-Validating --> LSTMoutput1.sWxf = LearnableParameter
-Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1])
-Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, 640])
-Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, 640])
-Validating --> LSTMoutput1.bf = LearnableParameter
-Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, 640], LSTMoutput1.bf[1024, 1])
-Validating --> LSTMoutput1.Whf = LearnableParameter
-Validating --> LSTMoutput1.sWhf = LearnableParameter
-Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1])
-Validating --> LSTMoutput1.Wcf = LearnableParameter
-Validating --> LSTMoutput1.sWcf = LearnableParameter
-Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1])
-Validating --> LSTMoutput1.Wxi = LearnableParameter
-Validating --> LSTMoutput1.sWxi = LearnableParameter
-Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1])
-Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, 640])
-Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, 640])
-Validating --> LSTMoutput1.bi = LearnableParameter
-Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, 640], LSTMoutput1.bi[1024, 1])
-Validating --> LSTMoutput1.Whi = LearnableParameter
-Validating --> LSTMoutput1.sWhi = LearnableParameter
-Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1])
-Validating --> LSTMoutput1.Wci = LearnableParameter
-Validating --> LSTMoutput1.sWci = LearnableParameter
-Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1])
-Validating --> LSTMoutput1.Wxc = LearnableParameter
-Validating --> LSTMoutput1.sWxc = LearnableParameter
-Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1])
-Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, 640])
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, 640])
-Validating --> LSTMoutput1.Whc = LearnableParameter
-Validating --> LSTMoutput1.sWhc = LearnableParameter
-Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1])
-Validating --> LSTMoutput1.bc = LearnableParameter
-Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, 640])
-Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, 640])
-Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, 640])
-Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, 640], LSTMoutput1.Whodh[1024, 640])
-Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, 640])
-Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, 640])
-Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, 640], LSTMoutput1.Whfdh[1024, 640])
-Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 640])
-Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 640])
-Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=472446402560, H=24545, C=120}, 640])
-Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, 640], LSTMoutput1.Wcfdc[1024 {W=472446402560, H=24545, C=120}, 640])
-Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=472446402560, H=24545, C=120}, 640])
-Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=472446402560, H=24545, C=120}, 640], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 640])
-Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, 640])
-Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, 640])
-Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, 640], LSTMoutput1.Whidh[1024, 640])
-Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=472446402560, H=24545, C=120}, 640])
-Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=472446402560, H=24545, C=120}, 640])
-Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, 640], LSTMoutput1.Wcidc[1024 {W=472446402560, H=24545, C=120}, 640])
-Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=472446402560, H=24545, C=120}, 640])
-Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, 640])
-Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, 640])
-Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 640], LSTMoutput1.bc[1024, 1])
-Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, 640], LSTMoutput1.unnamed161[1024, 640])
-Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, 640])
-Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=472446402560, H=24545, C=120}, 640], LSTMoutput1.unnamed159[1024, 640])
-Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=472446402560, H=24545, C=120}, 640], LSTMoutput1.bit[1024 {W=472446402560, H=24545, C=120}, 640])
-Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 640])
-Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=472446402560, H=24545, C=120}, 640])
-Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, 640], LSTMoutput1.Wcoct[1024 {W=472446402560, H=24545, C=120}, 640])
-Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=472446402560, H=24545, C=120}, 640])
-Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=472446402560, H=24545, C=120}, 640])
-Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=472446402560, H=24545, C=120}, 640], LSTMoutput1.unnamed174[1024 {W=472446402560, H=24545, C=120}, 640])
-Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=472446402560, H=24545, C=120}, 640])
-Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=472446402560, H=24545, C=120}, 640])
-Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, 640])
-Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, 640])
-Validating --> LSTMoutput2.bo = LearnableParameter
-Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, 640], LSTMoutput2.bo[1024, 1])
-Validating --> LSTMoutput2.Who = LearnableParameter
-Validating --> LSTMoutput2.sWho = LearnableParameter
-Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1])
-Validating --> LSTMoutput2.Wco = LearnableParameter
-Validating --> LSTMoutput2.sWco = LearnableParameter
-Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1])
-Validating --> LSTMoutput2.Wxf = LearnableParameter
-Validating --> LSTMoutput2.sWxf = LearnableParameter
-Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1])
-Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, 640])
-Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, 640])
-Validating --> LSTMoutput2.bf = LearnableParameter
-Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, 640], LSTMoutput2.bf[1024, 1])
-Validating --> LSTMoutput2.Whf = LearnableParameter
-Validating --> LSTMoutput2.sWhf = LearnableParameter
-Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1])
-Validating --> LSTMoutput2.Wcf = LearnableParameter
-Validating --> LSTMoutput2.sWcf = LearnableParameter
-Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1])
-Validating --> LSTMoutput2.Wxi = LearnableParameter
-Validating --> LSTMoutput2.sWxi = LearnableParameter
-Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1])
-Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, 640])
-Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, 640])
-Validating --> LSTMoutput2.bi = LearnableParameter
-Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, 640], LSTMoutput2.bi[1024, 1])
-Validating --> LSTMoutput2.Whi = LearnableParameter
-Validating --> LSTMoutput2.sWhi = LearnableParameter
-Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1])
-Validating --> LSTMoutput2.Wci = LearnableParameter
-Validating --> LSTMoutput2.sWci = LearnableParameter
-Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1])
-Validating --> LSTMoutput2.Wxc = LearnableParameter
-Validating --> LSTMoutput2.sWxc = LearnableParameter
-Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1])
-Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, 640])
-Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, 640])
-Validating --> LSTMoutput2.Whc = LearnableParameter
-Validating --> LSTMoutput2.sWhc = LearnableParameter
-Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1])
-Validating --> LSTMoutput2.bc = LearnableParameter
-Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, 640])
-Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, 640])
-Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, 640])
-Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, 640], LSTMoutput2.Whodh[1024, 640])
-Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, 640])
-Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, 640])
-Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, 640], LSTMoutput2.Whfdh[1024, 640])
-Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 640])
-Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 640])
-Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=472446402560, H=42273, C=120}, 640])
-Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, 640], LSTMoutput2.Wcfdc[1024 {W=472446402560, H=42273, C=120}, 640])
-Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=472446402560, H=42273, C=120}, 640])
-Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=472446402560, H=42273, C=120}, 640], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 640])
-Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, 640])
-Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, 640])
-Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, 640], LSTMoutput2.Whidh[1024, 640])
-Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=472446402560, H=42273, C=120}, 640])
-Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=472446402560, H=42273, C=120}, 640])
-Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, 640], LSTMoutput2.Wcidc[1024 {W=472446402560, H=42273, C=120}, 640])
-Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=472446402560, H=42273, C=120}, 640])
-Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, 640])
-Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, 640])
-Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, 640], LSTMoutput2.bc[1024, 1])
-Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, 640], LSTMoutput2.unnamed211[1024, 640])
-Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, 640])
-Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=472446402560, H=42273, C=120}, 640], LSTMoutput2.unnamed209[1024, 640])
-Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=472446402560, H=42273, C=120}, 640], LSTMoutput2.bit[1024 {W=472446402560, H=42273, C=120}, 640])
-Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 640])
-Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=472446402560, H=42273, C=120}, 640])
-Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, 640], LSTMoutput2.Wcoct[1024 {W=472446402560, H=42273, C=120}, 640])
-Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=472446402560, H=42273, C=120}, 640])
-Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=472446402560, H=42273, C=120}, 640])
-Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=472446402560, H=42273, C=120}, 640], LSTMoutput2.unnamed224[1024 {W=472446402560, H=42273, C=120}, 640])
-Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=472446402560, H=42273, C=120}, 640])
-Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=472446402560, H=42273, C=120}, 640])
-Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, 640])
-Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, 640])
-Validating --> LSTMoutput3.bo = LearnableParameter
-Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, 640], LSTMoutput3.bo[1024, 1])
-Validating --> LSTMoutput3.Who = LearnableParameter
-Validating --> LSTMoutput3.sWho = LearnableParameter
-Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1])
-Validating --> LSTMoutput3.Wco = LearnableParameter
-Validating --> LSTMoutput3.sWco = LearnableParameter
-Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1])
-Validating --> LSTMoutput3.Wxf = LearnableParameter
-Validating --> LSTMoutput3.sWxf = LearnableParameter
-Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1])
-Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, 640])
-Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, 640])
-Validating --> LSTMoutput3.bf = LearnableParameter
-Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, 640], LSTMoutput3.bf[1024, 1])
-Validating --> LSTMoutput3.Whf = LearnableParameter
-Validating --> LSTMoutput3.sWhf = LearnableParameter
-Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1])
-Validating --> LSTMoutput3.Wcf = LearnableParameter
-Validating --> LSTMoutput3.sWcf = LearnableParameter
-Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1])
-Validating --> LSTMoutput3.Wxi = LearnableParameter
-Validating --> LSTMoutput3.sWxi = LearnableParameter
-Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1])
-Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, 640])
-Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, 640])
-Validating --> LSTMoutput3.bi = LearnableParameter
-Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, 640], LSTMoutput3.bi[1024, 1])
-Validating --> LSTMoutput3.Whi = LearnableParameter
-Validating --> LSTMoutput3.sWhi = LearnableParameter
-Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1])
-Validating --> LSTMoutput3.Wci = LearnableParameter
-Validating --> LSTMoutput3.sWci = LearnableParameter
-Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1])
-Validating --> LSTMoutput3.Wxc = LearnableParameter
-Validating --> LSTMoutput3.sWxc = LearnableParameter
-Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1])
-Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, 640])
-Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, 640])
-Validating --> LSTMoutput3.Whc = LearnableParameter
-Validating --> LSTMoutput3.sWhc = LearnableParameter
-Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1])
-Validating --> LSTMoutput3.bc = LearnableParameter
-Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, 640])
-Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, 640])
-Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, 640])
-Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, 640], LSTMoutput3.Whodh[1024, 640])
-Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, 640])
-Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, 640])
-Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, 640], LSTMoutput3.Whfdh[1024, 640])
-Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 640])
-Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 640])
-Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=472446402560, H=61793, C=120}, 640])
-Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, 640], LSTMoutput3.Wcfdc[1024 {W=472446402560, H=61793, C=120}, 640])
-Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=472446402560, H=61793, C=120}, 640])
-Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=472446402560, H=61793, C=120}, 640], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 640])
-Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, 640])
-Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, 640])
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, 640], LSTMoutput3.Whidh[1024, 640])
-Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=472446402560, H=61793, C=120}, 640])
-Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=472446402560, H=61793, C=120}, 640])
-Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, 640], LSTMoutput3.Wcidc[1024 {W=472446402560, H=61793, C=120}, 640])
-Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=472446402560, H=61793, C=120}, 640])
-Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, 640])
-Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, 640])
-Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 640], LSTMoutput3.bc[1024, 1])
-Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, 640], LSTMoutput3.unnamed261[1024, 640])
-Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, 640])
-Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=472446402560, H=61793, C=120}, 640], LSTMoutput3.unnamed259[1024, 640])
-Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=472446402560, H=61793, C=120}, 640], LSTMoutput3.bit[1024 {W=472446402560, H=61793, C=120}, 640])
-Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 640])
-Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=472446402560, H=61793, C=120}, 640])
-Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, 640], LSTMoutput3.Wcoct[1024 {W=472446402560, H=61793, C=120}, 640])
-Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=472446402560, H=61793, C=120}, 640])
-Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=472446402560, H=61793, C=120}, 640])
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=472446402560, H=61793, C=120}, 640], LSTMoutput3.unnamed274[1024 {W=472446402560, H=61793, C=120}, 640])
-Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=472446402560, H=61793, C=120}, 640])
-Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=472446402560, H=61793, C=120}, 640])
-Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, 640])
-Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, 640])
-Validating --> b = LearnableParameter
-Validating --> LSTMoutputW = Plus(unnamed283[132, 640], b[132, 1])
-Validating --> Err = ErrorPrediction(labels[132, 640], LSTMoutputW[132, 640])
-
- Epoch[ 1 of 4]-Minibatch[   1-  10 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.80573893; EvalErr[0]PerSample = 0.90281248; TotalTime = 2.72155s; TotalTimePerSample = 0.42524ms; SamplesPerSecond = 2351
- Epoch[ 1 of 4]-Minibatch[  11-  20 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.59921312; EvalErr[0]PerSample = 0.85390627; TotalTime = 2.71606s; TotalTimePerSample = 0.42438ms; SamplesPerSecond = 2356
- Epoch[ 1 of 4]-Minibatch[  21-  30 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  5.29241562; EvalErr[0]PerSample = 0.87921876; TotalTime = 2.70903s; TotalTimePerSample = 0.42329ms; SamplesPerSecond = 2362
-Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 4.8512392; EvalErrPerSample = 0.86728519; Ave LearnRatePerSample = 0.0007812500116; EpochTime=8.7031
-Starting Epoch 2: learning rate per sample = 0.000781  momentum = 0.899991 
+ Epoch[ 1 of 4]-Minibatch[   1-  10 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.80573914; EvalErr[0]PerSample = 0.90281250; TotalTime = 3.70464s; TotalTimePerSample = 0.57885ms; SamplesPerSecond = 1727
+ Epoch[ 1 of 4]-Minibatch[  11-  20 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.59921326; EvalErr[0]PerSample = 0.85390625; TotalTime = 3.83160s; TotalTimePerSample = 0.59869ms; SamplesPerSecond = 1670
+ Epoch[ 1 of 4]-Minibatch[  21-  30 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  5.29241577; EvalErr[0]PerSample = 0.87921875; TotalTime = 3.76323s; TotalTimePerSample = 0.58800ms; SamplesPerSecond = 1700
+Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 4.8512392; EvalErrPerSample = 0.86728519; Ave LearnRatePerSample = 0.0007812500116; EpochTime=12.253369
+Starting Epoch 2: learning rate per sample = 0.000781  effective momentum = 0.900000 
 minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20546), data subset 0 of 1, with 1 datapasses
 
 Starting minibatch loop.
- Epoch[ 2 of 4]-Minibatch[   1-  10 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.39003801; EvalErr[0]PerSample = 0.85187501; TotalTime = 2.68673s; TotalTimePerSample = 0.41980ms; SamplesPerSecond = 2382
- Epoch[ 2 of 4]-Minibatch[  11-  20 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.25110769; EvalErr[0]PerSample = 0.84484375; TotalTime = 2.70369s; TotalTimePerSample = 0.42245ms; SamplesPerSecond = 2367
- Epoch[ 2 of 4]-Minibatch[  21-  30 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  3.78259087; EvalErr[0]PerSample = 0.74578124; TotalTime = 2.71281s; TotalTimePerSample = 0.42388ms; SamplesPerSecond = 2359
-Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 4.0735416; EvalErrPerSample = 0.79853517; Ave LearnRatePerSample = 0.0007812500116; EpochTime=8.653936
-Starting Epoch 3: learning rate per sample = 0.000781  momentum = 0.899991 
+ Epoch[ 2 of 4]-Minibatch[   1-  10 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.39003784; EvalErr[0]PerSample = 0.85187500; TotalTime = 3.68450s; TotalTimePerSample = 0.57570ms; SamplesPerSecond = 1737
+ Epoch[ 2 of 4]-Minibatch[  11-  20 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.25110718; EvalErr[0]PerSample = 0.84484375; TotalTime = 3.83342s; TotalTimePerSample = 0.59897ms; SamplesPerSecond = 1669
+ Epoch[ 2 of 4]-Minibatch[  21-  30 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  3.78258545; EvalErr[0]PerSample = 0.74578125; TotalTime = 3.75383s; TotalTimePerSample = 0.58654ms; SamplesPerSecond = 1704
+Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 4.0735388; EvalErrPerSample = 0.79853517; Ave LearnRatePerSample = 0.0007812500116; EpochTime=12.144325
+Starting Epoch 3: learning rate per sample = 0.000781  effective momentum = 0.900000 
 minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40980), data subset 0 of 1, with 1 datapasses
 
 Starting minibatch loop.
- Epoch[ 3 of 4]-Minibatch[   1-  10 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.11762667; EvalErr[0]PerSample = 0.83671874; TotalTime = 2.69289s; TotalTimePerSample = 0.42076ms; SamplesPerSecond = 2376
- Epoch[ 3 of 4]-Minibatch[  11-  20 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.18654823; EvalErr[0]PerSample = 0.86468750; TotalTime = 2.70456s; TotalTimePerSample = 0.42259ms; SamplesPerSecond = 2366
- Epoch[ 3 of 4]-Minibatch[  21-  30 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  3.90151119; EvalErr[0]PerSample = 0.83328128; TotalTime = 2.71127s; TotalTimePerSample = 0.42364ms; SamplesPerSecond = 2360
-Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 4.0097828; EvalErrPerSample = 0.83603519; Ave LearnRatePerSample = 0.0007812500116; EpochTime=8.661351
-Starting Epoch 4: learning rate per sample = 0.000781  momentum = 0.899991 
+ Epoch[ 3 of 4]-Minibatch[   1-  10 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.11762695; EvalErr[0]PerSample = 0.83671875; TotalTime = 3.71971s; TotalTimePerSample = 0.58120ms; SamplesPerSecond = 1720
+ Epoch[ 3 of 4]-Minibatch[  11-  20 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.18655273; EvalErr[0]PerSample = 0.86468750; TotalTime = 3.81275s; TotalTimePerSample = 0.59574ms; SamplesPerSecond = 1678
+ Epoch[ 3 of 4]-Minibatch[  21-  30 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  3.90151001; EvalErr[0]PerSample = 0.83328125; TotalTime = 3.76173s; TotalTimePerSample = 0.58777ms; SamplesPerSecond = 1701
+Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 4.0097837; EvalErrPerSample = 0.83603519; Ave LearnRatePerSample = 0.0007812500116; EpochTime=12.173884
+Starting Epoch 4: learning rate per sample = 0.000781  effective momentum = 0.900000 
 minibatchiterator: epoch 3: frames [61440..81920] (first utterance at frame 61662), data subset 0 of 1, with 1 datapasses
 
 Starting minibatch loop.
- Epoch[ 4 of 4]-Minibatch[   1-  10 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.06626415; EvalErr[0]PerSample = 0.85124999; TotalTime = 2.68899s; TotalTimePerSample = 0.42015ms; SamplesPerSecond = 2380
- Epoch[ 4 of 4]-Minibatch[  11-  20 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.13874769; EvalErr[0]PerSample = 0.87437499; TotalTime = 2.70160s; TotalTimePerSample = 0.42213ms; SamplesPerSecond = 2368
- Epoch[ 4 of 4]-Minibatch[  21-  30 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  3.94609857; EvalErr[0]PerSample = 0.81968749; TotalTime = 2.71265s; TotalTimePerSample = 0.42385ms; SamplesPerSecond = 2359
-Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 3.9959295; EvalErrPerSample = 0.83603519; Ave LearnRatePerSample = 0.0007812500116; EpochTime=8.661498
+ Epoch[ 4 of 4]-Minibatch[   1-  10 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.06626434; EvalErr[0]PerSample = 0.85125000; TotalTime = 3.68964s; TotalTimePerSample = 0.57651ms; SamplesPerSecond = 1734
+ Epoch[ 4 of 4]-Minibatch[  11-  20 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.13874786; EvalErr[0]PerSample = 0.87437500; TotalTime = 3.79460s; TotalTimePerSample = 0.59291ms; SamplesPerSecond = 1686
+ Epoch[ 4 of 4]-Minibatch[  21-  30 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  3.94609985; EvalErr[0]PerSample = 0.81968750; TotalTime = 3.77592s; TotalTimePerSample = 0.58999ms; SamplesPerSecond = 1694
+Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 3.9959297; EvalErrPerSample = 0.83603519; Ave LearnRatePerSample = 0.0007812500116; EpochTime=12.162679
+CNTKCommandTrainEnd: speechTrain
 COMPLETED
diff --git a/Tests/Speech/LSTM/Truncated/baseline.windows.gpu.txt b/Tests/Speech/LSTM/Truncated/baseline.windows.gpu.txt
index ad2d630f0..3aaebe062 100644
--- a/Tests/Speech/LSTM/Truncated/baseline.windows.gpu.txt
+++ b/Tests/Speech/LSTM/Truncated/baseline.windows.gpu.txt
@@ -1,16 +1,16 @@
-=== Running /cygdrive/e/NetScale/CNTK/git_repos/public_master/x64/debug/cntk.exe configFile=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\LSTM\cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20150908130820.629582\Speech_LSTM@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data DeviceId=0 NDLDir=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\LSTM
+=== Running /cygdrive/e/NetScale/CNTK/git_repos/cplx_master2/x64/debug/cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM/cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151024135527.675673\Speech\LSTM_Truncated@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM DeviceId=0
 -------------------------------------------------------------------
 Build info: 
 
-		Built time: Sep  8 2015 13:07:27
-		Last modified date: Tue Sep  8 13:07:20 2015
+		Built time: Oct 24 2015 13:33:25
+		Last modified date: Thu Oct 22 16:00:27 2015
 		Built by amitaga on Amitaga-Win-DT3           
-		Build Path: E:\NetScale\CNTK\git_repos\public_master\MachineLearning\CNTK\
+		Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\
 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
 -------------------------------------------------------------------
-running on Amitaga-Win-DT3 at 2015/09/08 21:08:21
-command line options: 
-configFile=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\LSTM\cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20150908130820.629582\Speech_LSTM@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data DeviceId=0 NDLDir=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\LSTM 
+running on Amitaga-Win-DT3 at 2015/10/24 21:55:28
+command line: 
+E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM/cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151024135527.675673\Speech\LSTM_Truncated@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM DeviceId=0 
 
 >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
 precision=float
@@ -24,9 +24,6 @@ speechTrain=[
     modelPath=$RunDir$/models/cntkSpeech.dnn
     deviceId=$DeviceId$
     traceLevel=1
-    NDLNetworkBuilder=[
-        networkDescription=$NDLDir$/lstmp-3layer_WithSelfStab.ndl
-    ]    
     SGD=[
         epochSize=20480
         minibatchSize=20
@@ -200,10 +197,10 @@ feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features);
         ScaledLogLikelihood = Minus(LSTMoutputW, logPrior, tag='output')    // sadly we can't say x - y since we want to assign a tag
     ]
 ]
-RunDir=C:\cygwin64\tmp\cntk-test-20150908130820.629582\Speech_LSTM@debug_gpu
-DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data
+RunDir=C:\cygwin64\tmp\cntk-test-20151024135527.675673\Speech\LSTM_Truncated@debug_gpu
+DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
+ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM
 DeviceId=0
-NDLDir=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\LSTM
 
 <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
 
@@ -216,12 +213,9 @@ frameMode=false
 Truncated=true
 speechTrain=[
     action=train
-    modelPath=C:\cygwin64\tmp\cntk-test-20150908130820.629582\Speech_LSTM@debug_gpu/models/cntkSpeech.dnn
+    modelPath=C:\cygwin64\tmp\cntk-test-20151024135527.675673\Speech\LSTM_Truncated@debug_gpu/models/cntkSpeech.dnn
     deviceId=0
     traceLevel=1
-    NDLNetworkBuilder=[
-        networkDescription=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\LSTM/lstmp-3layer_WithSelfStab.ndl
-    ]    
     SGD=[
         epochSize=20480
         minibatchSize=20
@@ -241,11 +235,11 @@ speechTrain=[
       features=[
           dim=363
           type=Real
-          scpFile=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data/glob_0000.scp
+          scpFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.scp
       ]
       labels=[
-          mlfFile=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data/glob_0000.mlf
-          labelMappingFile=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data/state.list
+          mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf
+          labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list
           labelDim=132
           labelType=Category
       ]
@@ -395,30 +389,27 @@ feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features);
         ScaledLogLikelihood = Minus(LSTMoutputW, logPrior, tag='output')    // sadly we can't say x - y since we want to assign a tag
     ]
 ]
-RunDir=C:\cygwin64\tmp\cntk-test-20150908130820.629582\Speech_LSTM@debug_gpu
-DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data
+RunDir=C:\cygwin64\tmp\cntk-test-20151024135527.675673\Speech\LSTM_Truncated@debug_gpu
+DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
+ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM
 DeviceId=0
-NDLDir=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\LSTM
 
 <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 
 >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 configparameters: cntk.config:command=speechTrain
-configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data
+configparameters: cntk.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\LSTM
+configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
 configparameters: cntk.config:deviceId=0
 configparameters: cntk.config:frameMode=false
-configparameters: cntk.config:NDLDir=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\LSTM
 configparameters: cntk.config:parallelTrain=false
 configparameters: cntk.config:precision=float
-configparameters: cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20150908130820.629582\Speech_LSTM@debug_gpu
+configparameters: cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024135527.675673\Speech\LSTM_Truncated@debug_gpu
 configparameters: cntk.config:speechTrain=[
     action=train
-    modelPath=C:\cygwin64\tmp\cntk-test-20150908130820.629582\Speech_LSTM@debug_gpu/models/cntkSpeech.dnn
+    modelPath=C:\cygwin64\tmp\cntk-test-20151024135527.675673\Speech\LSTM_Truncated@debug_gpu/models/cntkSpeech.dnn
     deviceId=0
     traceLevel=1
-    NDLNetworkBuilder=[
-        networkDescription=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\LSTM/lstmp-3layer_WithSelfStab.ndl
-    ]    
     SGD=[
         epochSize=20480
         minibatchSize=20
@@ -438,11 +429,11 @@ configparameters: cntk.config:speechTrain=[
       features=[
           dim=363
           type=Real
-          scpFile=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data/glob_0000.scp
+          scpFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.scp
       ]
       labels=[
-          mlfFile=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data/glob_0000.mlf
-          labelMappingFile=E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data/state.list
+          mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf
+          labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list
           labelDim=132
           labelType=Category
       ]
@@ -597,1789 +588,3393 @@ configparameters: cntk.config:Truncated=true
 <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 command: speechTrain 
 precision = float
-NDLBuilder Using GPU 0
-reading script file E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data/glob_0000.scp ... 948 entries
+CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024135527.675673\Speech\LSTM_Truncated@debug_gpu/models/cntkSpeech.dnn
+CNTKCommandTrainInfo: speechTrain : 4
+CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 4
+CNTKCommandTrainBegin: speechTrain
+ExperimentalNetworkBuilder using GPU 0
+reading script file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.scp ... 948 entries
 trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion
-total 132 state names in state list E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data/state.list
-htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\public_master\Tests\Speech\Data/glob_0000.mlf ... total 948 entries
+total 132 state names in state list E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list
+htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf ... total 948 entries
 ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
 label set 0: 129 classes
 minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
- nodes in the recurrent loops : 
-LSTMoutput1.unnamed174	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.bit	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.unnamed224	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.bit	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.unnamed274	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.bit	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.unnamed174	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.bit	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.unnamed224	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.bit	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.unnamed274	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.bit	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
-
-Printing Gradient Computation Node Order ... 
-
-cr[0, 0] = CrossEntropyWithSoftmax(labels[132, 1], LSTMoutputW[0, 0])
-LSTMoutputW[0, 0] = Plus(unnamed283[0, 0], b[132, 1])
-b[132, 1] = LearnableParameter
-unnamed283[0, 0] = Times(W[132, 256], unnamed284[0, 0])
-unnamed284[0, 0] = Scale(expsW[0, 0], LSTMoutput3.output[0, 0])
-LSTMoutput3.output[0, 0] = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[0, 0])
-LSTMoutput3.unnamed275[0, 0] = Scale(LSTMoutput3.expsWmr[0, 0], LSTMoutput3.mt[0, 0])
-LSTMoutput3.mt[0, 0] = ElementTimes(LSTMoutput3.ot[0, 0], LSTMoutput3.unnamed274[0, 0])
-LSTMoutput3.unnamed274[0, 0] = Tanh(LSTMoutput3.ct[0, 0])
-LSTMoutput3.ot[0, 0] = Sigmoid(LSTMoutput3.unnamed271[0, 0])
-LSTMoutput3.unnamed271[0, 0] = Plus(LSTMoutput3.unnamed272[0, 0], LSTMoutput3.Wcoct[0, 0])
-LSTMoutput3.Wcoct[0, 0] = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[0, 0])
-LSTMoutput3.unnamed270[0, 0] = Scale(LSTMoutput3.expsWco[0, 0], LSTMoutput3.ct[0, 0])
-LSTMoutput3.ct[0, 0] = Plus(LSTMoutput3.bft[0, 0], LSTMoutput3.bit[0, 0])
-LSTMoutput3.bit[0, 0] = ElementTimes(LSTMoutput3.it[0, 0], LSTMoutput3.unnamed259[0, 0])
-LSTMoutput3.unnamed259[0, 0] = Tanh(LSTMoutput3.unnamed260[0, 0])
-LSTMoutput3.unnamed260[0, 0] = Plus(LSTMoutput3.Wxcx[0, 0], LSTMoutput3.unnamed261[0, 0])
-LSTMoutput3.unnamed261[0, 0] = Plus(LSTMoutput3.Whcdh[0, 0], LSTMoutput3.bc[1024, 1])
-LSTMoutput3.Whcdh[0, 0] = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[0, 0])
-LSTMoutput3.unnamed258[0, 0] = Scale(LSTMoutput3.expsWhc[0, 0], LSTMoutput3.dh[256, 1])
-LSTMoutput3.it[0, 0] = Sigmoid(LSTMoutput3.unnamed254[0, 0])
-LSTMoutput3.unnamed254[0, 0] = Plus(LSTMoutput3.unnamed255[0, 0], LSTMoutput3.Wcidc[0, 0])
-LSTMoutput3.Wcidc[0, 0] = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[0, 0])
-LSTMoutput3.unnamed253[0, 0] = Scale(LSTMoutput3.expsWci[0, 0], LSTMoutput3.dc[1024, 1])
-LSTMoutput3.unnamed255[0, 0] = Plus(LSTMoutput3.unnamed256[0, 0], LSTMoutput3.Whidh[0, 0])
-LSTMoutput3.Whidh[0, 0] = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[0, 0])
-LSTMoutput3.unnamed252[0, 0] = Scale(LSTMoutput3.expsWhi[0, 0], LSTMoutput3.dh[256, 1])
-LSTMoutput3.bft[0, 0] = ElementTimes(LSTMoutput3.ft[0, 0], LSTMoutput3.dc[1024, 1])
-LSTMoutput3.ft[0, 0] = Sigmoid(LSTMoutput3.unnamed265[0, 0])
-LSTMoutput3.unnamed265[0, 0] = Plus(LSTMoutput3.unnamed266[0, 0], LSTMoutput3.Wcfdc[0, 0])
-LSTMoutput3.Wcfdc[0, 0] = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[0, 0])
-LSTMoutput3.unnamed264[0, 0] = Scale(LSTMoutput3.expsWcf[0, 0], LSTMoutput3.dc[1024, 1])
-LSTMoutput3.dc[1024, 1] = PastValue(LSTMoutput3.ct[0, 0])
-LSTMoutput3.unnamed266[0, 0] = Plus(LSTMoutput3.unnamed267[0, 0], LSTMoutput3.Whfdh[0, 0])
-LSTMoutput3.Whfdh[0, 0] = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[0, 0])
-LSTMoutput3.unnamed263[0, 0] = Scale(LSTMoutput3.expsWhf[0, 0], LSTMoutput3.dh[256, 1])
-LSTMoutput3.unnamed272[0, 0] = Plus(LSTMoutput3.unnamed273[0, 0], LSTMoutput3.Whodh[0, 0])
-LSTMoutput3.Whodh[0, 0] = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[0, 0])
-LSTMoutput3.unnamed269[0, 0] = Scale(LSTMoutput3.expsWho[0, 0], LSTMoutput3.dh[256, 1])
-LSTMoutput3.dh[256, 1] = PastValue(LSTMoutput3.output[0, 0])
-LSTMoutput3.bc[1024, 1] = LearnableParameter
-LSTMoutput3.expsWhc[0, 0] = Exp(LSTMoutput3.sWhc[1, 1])
-LSTMoutput3.sWhc[1, 1] = LearnableParameter
-LSTMoutput3.Whc[1024, 256] = LearnableParameter
-LSTMoutput3.Wxcx[0, 0] = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[0, 0])
-LSTMoutput3.unnamed257[0, 0] = Scale(LSTMoutput3.expsWxc[0, 0], LSTMoutput2.output[0, 0])
-LSTMoutput3.expsWxc[0, 0] = Exp(LSTMoutput3.sWxc[1, 1])
-LSTMoutput3.sWxc[1, 1] = LearnableParameter
-LSTMoutput3.Wxc[1024, 256] = LearnableParameter
-LSTMoutput3.expsWci[0, 0] = Exp(LSTMoutput3.sWci[1, 1])
-LSTMoutput3.sWci[1, 1] = LearnableParameter
-LSTMoutput3.Wci[1024, 1] = LearnableParameter
-LSTMoutput3.expsWhi[0, 0] = Exp(LSTMoutput3.sWhi[1, 1])
-LSTMoutput3.sWhi[1, 1] = LearnableParameter
-LSTMoutput3.Whi[1024, 256] = LearnableParameter
-LSTMoutput3.unnamed256[0, 0] = Plus(LSTMoutput3.Wxix[0, 0], LSTMoutput3.bi[1024, 1])
-LSTMoutput3.bi[1024, 1] = LearnableParameter
-LSTMoutput3.Wxix[0, 0] = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[0, 0])
-LSTMoutput3.unnamed251[0, 0] = Scale(LSTMoutput3.expsWxi[0, 0], LSTMoutput2.output[0, 0])
-LSTMoutput3.expsWxi[0, 0] = Exp(LSTMoutput3.sWxi[1, 1])
-LSTMoutput3.sWxi[1, 1] = LearnableParameter
-LSTMoutput3.Wxi[1024, 256] = LearnableParameter
-LSTMoutput3.expsWcf[0, 0] = Exp(LSTMoutput3.sWcf[1, 1])
-LSTMoutput3.sWcf[1, 1] = LearnableParameter
-LSTMoutput3.Wcf[1024, 1] = LearnableParameter
-LSTMoutput3.expsWhf[0, 0] = Exp(LSTMoutput3.sWhf[1, 1])
-LSTMoutput3.sWhf[1, 1] = LearnableParameter
-LSTMoutput3.Whf[1024, 256] = LearnableParameter
-LSTMoutput3.unnamed267[0, 0] = Plus(LSTMoutput3.Wxfx[0, 0], LSTMoutput3.bf[1024, 1])
-LSTMoutput3.bf[1024, 1] = LearnableParameter
-LSTMoutput3.Wxfx[0, 0] = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[0, 0])
-LSTMoutput3.unnamed262[0, 0] = Scale(LSTMoutput3.expsWxf[0, 0], LSTMoutput2.output[0, 0])
-LSTMoutput3.expsWxf[0, 0] = Exp(LSTMoutput3.sWxf[1, 1])
-LSTMoutput3.sWxf[1, 1] = LearnableParameter
-LSTMoutput3.Wxf[1024, 256] = LearnableParameter
-LSTMoutput3.expsWco[0, 0] = Exp(LSTMoutput3.sWco[1, 1])
-LSTMoutput3.sWco[1, 1] = LearnableParameter
-LSTMoutput3.Wco[1024, 1] = LearnableParameter
-LSTMoutput3.expsWho[0, 0] = Exp(LSTMoutput3.sWho[1, 1])
-LSTMoutput3.sWho[1, 1] = LearnableParameter
-LSTMoutput3.Who[1024, 256] = LearnableParameter
-LSTMoutput3.unnamed273[0, 0] = Plus(LSTMoutput3.Wxox[0, 0], LSTMoutput3.bo[1024, 1])
-LSTMoutput3.bo[1024, 1] = LearnableParameter
-LSTMoutput3.Wxox[0, 0] = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[0, 0])
-LSTMoutput3.unnamed268[0, 0] = Scale(LSTMoutput3.expsWxo[0, 0], LSTMoutput2.output[0, 0])
-LSTMoutput2.output[0, 0] = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[0, 0])
-LSTMoutput2.unnamed225[0, 0] = Scale(LSTMoutput2.expsWmr[0, 0], LSTMoutput2.mt[0, 0])
-LSTMoutput2.mt[0, 0] = ElementTimes(LSTMoutput2.ot[0, 0], LSTMoutput2.unnamed224[0, 0])
-LSTMoutput2.unnamed224[0, 0] = Tanh(LSTMoutput2.ct[0, 0])
-LSTMoutput2.ot[0, 0] = Sigmoid(LSTMoutput2.unnamed221[0, 0])
-LSTMoutput2.unnamed221[0, 0] = Plus(LSTMoutput2.unnamed222[0, 0], LSTMoutput2.Wcoct[0, 0])
-LSTMoutput2.Wcoct[0, 0] = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[0, 0])
-LSTMoutput2.unnamed220[0, 0] = Scale(LSTMoutput2.expsWco[0, 0], LSTMoutput2.ct[0, 0])
-LSTMoutput2.ct[0, 0] = Plus(LSTMoutput2.bft[0, 0], LSTMoutput2.bit[0, 0])
-LSTMoutput2.bit[0, 0] = ElementTimes(LSTMoutput2.it[0, 0], LSTMoutput2.unnamed209[0, 0])
-LSTMoutput2.unnamed209[0, 0] = Tanh(LSTMoutput2.unnamed210[0, 0])
-LSTMoutput2.unnamed210[0, 0] = Plus(LSTMoutput2.Wxcx[0, 0], LSTMoutput2.unnamed211[0, 0])
-LSTMoutput2.unnamed211[0, 0] = Plus(LSTMoutput2.Whcdh[0, 0], LSTMoutput2.bc[1024, 1])
-LSTMoutput2.Whcdh[0, 0] = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[0, 0])
-LSTMoutput2.unnamed208[0, 0] = Scale(LSTMoutput2.expsWhc[0, 0], LSTMoutput2.dh[256, 1])
-LSTMoutput2.it[0, 0] = Sigmoid(LSTMoutput2.unnamed204[0, 0])
-LSTMoutput2.unnamed204[0, 0] = Plus(LSTMoutput2.unnamed205[0, 0], LSTMoutput2.Wcidc[0, 0])
-LSTMoutput2.Wcidc[0, 0] = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[0, 0])
-LSTMoutput2.unnamed203[0, 0] = Scale(LSTMoutput2.expsWci[0, 0], LSTMoutput2.dc[1024, 1])
-LSTMoutput2.unnamed205[0, 0] = Plus(LSTMoutput2.unnamed206[0, 0], LSTMoutput2.Whidh[0, 0])
-LSTMoutput2.Whidh[0, 0] = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[0, 0])
-LSTMoutput2.unnamed202[0, 0] = Scale(LSTMoutput2.expsWhi[0, 0], LSTMoutput2.dh[256, 1])
-LSTMoutput2.bft[0, 0] = ElementTimes(LSTMoutput2.ft[0, 0], LSTMoutput2.dc[1024, 1])
-LSTMoutput2.ft[0, 0] = Sigmoid(LSTMoutput2.unnamed215[0, 0])
-LSTMoutput2.unnamed215[0, 0] = Plus(LSTMoutput2.unnamed216[0, 0], LSTMoutput2.Wcfdc[0, 0])
-LSTMoutput2.Wcfdc[0, 0] = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[0, 0])
-LSTMoutput2.unnamed214[0, 0] = Scale(LSTMoutput2.expsWcf[0, 0], LSTMoutput2.dc[1024, 1])
-LSTMoutput2.dc[1024, 1] = PastValue(LSTMoutput2.ct[0, 0])
-LSTMoutput2.unnamed216[0, 0] = Plus(LSTMoutput2.unnamed217[0, 0], LSTMoutput2.Whfdh[0, 0])
-LSTMoutput2.Whfdh[0, 0] = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[0, 0])
-LSTMoutput2.unnamed213[0, 0] = Scale(LSTMoutput2.expsWhf[0, 0], LSTMoutput2.dh[256, 1])
-LSTMoutput2.unnamed222[0, 0] = Plus(LSTMoutput2.unnamed223[0, 0], LSTMoutput2.Whodh[0, 0])
-LSTMoutput2.Whodh[0, 0] = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[0, 0])
-LSTMoutput2.unnamed219[0, 0] = Scale(LSTMoutput2.expsWho[0, 0], LSTMoutput2.dh[256, 1])
-LSTMoutput2.dh[256, 1] = PastValue(LSTMoutput2.output[0, 0])
-LSTMoutput2.bc[1024, 1] = LearnableParameter
-LSTMoutput2.expsWhc[0, 0] = Exp(LSTMoutput2.sWhc[1, 1])
-LSTMoutput2.sWhc[1, 1] = LearnableParameter
-LSTMoutput2.Whc[1024, 256] = LearnableParameter
-LSTMoutput2.Wxcx[0, 0] = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[0, 0])
-LSTMoutput2.unnamed207[0, 0] = Scale(LSTMoutput2.expsWxc[0, 0], LSTMoutput1.output[0, 0])
-LSTMoutput2.expsWxc[0, 0] = Exp(LSTMoutput2.sWxc[1, 1])
-LSTMoutput2.sWxc[1, 1] = LearnableParameter
-LSTMoutput2.Wxc[1024, 256] = LearnableParameter
-LSTMoutput2.expsWci[0, 0] = Exp(LSTMoutput2.sWci[1, 1])
-LSTMoutput2.sWci[1, 1] = LearnableParameter
-LSTMoutput2.Wci[1024, 1] = LearnableParameter
-LSTMoutput2.expsWhi[0, 0] = Exp(LSTMoutput2.sWhi[1, 1])
-LSTMoutput2.sWhi[1, 1] = LearnableParameter
-LSTMoutput2.Whi[1024, 256] = LearnableParameter
-LSTMoutput2.unnamed206[0, 0] = Plus(LSTMoutput2.Wxix[0, 0], LSTMoutput2.bi[1024, 1])
-LSTMoutput2.bi[1024, 1] = LearnableParameter
-LSTMoutput2.Wxix[0, 0] = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[0, 0])
-LSTMoutput2.unnamed201[0, 0] = Scale(LSTMoutput2.expsWxi[0, 0], LSTMoutput1.output[0, 0])
-LSTMoutput2.expsWxi[0, 0] = Exp(LSTMoutput2.sWxi[1, 1])
-LSTMoutput2.sWxi[1, 1] = LearnableParameter
-LSTMoutput2.Wxi[1024, 256] = LearnableParameter
-LSTMoutput2.expsWcf[0, 0] = Exp(LSTMoutput2.sWcf[1, 1])
-LSTMoutput2.sWcf[1, 1] = LearnableParameter
-LSTMoutput2.Wcf[1024, 1] = LearnableParameter
-LSTMoutput2.expsWhf[0, 0] = Exp(LSTMoutput2.sWhf[1, 1])
-LSTMoutput2.sWhf[1, 1] = LearnableParameter
-LSTMoutput2.Whf[1024, 256] = LearnableParameter
-LSTMoutput2.unnamed217[0, 0] = Plus(LSTMoutput2.Wxfx[0, 0], LSTMoutput2.bf[1024, 1])
-LSTMoutput2.bf[1024, 1] = LearnableParameter
-LSTMoutput2.Wxfx[0, 0] = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[0, 0])
-LSTMoutput2.unnamed212[0, 0] = Scale(LSTMoutput2.expsWxf[0, 0], LSTMoutput1.output[0, 0])
-LSTMoutput2.expsWxf[0, 0] = Exp(LSTMoutput2.sWxf[1, 1])
-LSTMoutput2.sWxf[1, 1] = LearnableParameter
-LSTMoutput2.Wxf[1024, 256] = LearnableParameter
-LSTMoutput2.expsWco[0, 0] = Exp(LSTMoutput2.sWco[1, 1])
-LSTMoutput2.sWco[1, 1] = LearnableParameter
-LSTMoutput2.Wco[1024, 1] = LearnableParameter
-LSTMoutput2.expsWho[0, 0] = Exp(LSTMoutput2.sWho[1, 1])
-LSTMoutput2.sWho[1, 1] = LearnableParameter
-LSTMoutput2.Who[1024, 256] = LearnableParameter
-LSTMoutput2.unnamed223[0, 0] = Plus(LSTMoutput2.Wxox[0, 0], LSTMoutput2.bo[1024, 1])
-LSTMoutput2.bo[1024, 1] = LearnableParameter
-LSTMoutput2.Wxox[0, 0] = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[0, 0])
-LSTMoutput2.unnamed218[0, 0] = Scale(LSTMoutput2.expsWxo[0, 0], LSTMoutput1.output[0, 0])
-LSTMoutput1.output[0, 0] = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[0, 0])
-LSTMoutput1.unnamed175[0, 0] = Scale(LSTMoutput1.expsWmr[0, 0], LSTMoutput1.mt[0, 0])
-LSTMoutput1.mt[0, 0] = ElementTimes(LSTMoutput1.ot[0, 0], LSTMoutput1.unnamed174[0, 0])
-LSTMoutput1.unnamed174[0, 0] = Tanh(LSTMoutput1.ct[0, 0])
-LSTMoutput1.ot[0, 0] = Sigmoid(LSTMoutput1.unnamed171[0, 0])
-LSTMoutput1.unnamed171[0, 0] = Plus(LSTMoutput1.unnamed172[0, 0], LSTMoutput1.Wcoct[0, 0])
-LSTMoutput1.Wcoct[0, 0] = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[0, 0])
-LSTMoutput1.unnamed170[0, 0] = Scale(LSTMoutput1.expsWco[0, 0], LSTMoutput1.ct[0, 0])
-LSTMoutput1.ct[0, 0] = Plus(LSTMoutput1.bft[0, 0], LSTMoutput1.bit[0, 0])
-LSTMoutput1.bit[0, 0] = ElementTimes(LSTMoutput1.it[0, 0], LSTMoutput1.unnamed159[0, 0])
-LSTMoutput1.unnamed159[0, 0] = Tanh(LSTMoutput1.unnamed160[0, 0])
-LSTMoutput1.unnamed160[0, 0] = Plus(LSTMoutput1.Wxcx[0, 0], LSTMoutput1.unnamed161[0, 0])
-LSTMoutput1.unnamed161[0, 0] = Plus(LSTMoutput1.Whcdh[0, 0], LSTMoutput1.bc[1024, 1])
-LSTMoutput1.Whcdh[0, 0] = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[0, 0])
-LSTMoutput1.unnamed158[0, 0] = Scale(LSTMoutput1.expsWhc[0, 0], LSTMoutput1.dh[256, 1])
-LSTMoutput1.it[0, 0] = Sigmoid(LSTMoutput1.unnamed154[0, 0])
-LSTMoutput1.unnamed154[0, 0] = Plus(LSTMoutput1.unnamed155[0, 0], LSTMoutput1.Wcidc[0, 0])
-LSTMoutput1.Wcidc[0, 0] = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[0, 0])
-LSTMoutput1.unnamed153[0, 0] = Scale(LSTMoutput1.expsWci[0, 0], LSTMoutput1.dc[1024, 1])
-LSTMoutput1.unnamed155[0, 0] = Plus(LSTMoutput1.unnamed156[0, 0], LSTMoutput1.Whidh[0, 0])
-LSTMoutput1.Whidh[0, 0] = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[0, 0])
-LSTMoutput1.unnamed152[0, 0] = Scale(LSTMoutput1.expsWhi[0, 0], LSTMoutput1.dh[256, 1])
-LSTMoutput1.bft[0, 0] = ElementTimes(LSTMoutput1.ft[0, 0], LSTMoutput1.dc[1024, 1])
-LSTMoutput1.ft[0, 0] = Sigmoid(LSTMoutput1.unnamed165[0, 0])
-LSTMoutput1.unnamed165[0, 0] = Plus(LSTMoutput1.unnamed166[0, 0], LSTMoutput1.Wcfdc[0, 0])
-LSTMoutput1.Wcfdc[0, 0] = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[0, 0])
-LSTMoutput1.unnamed164[0, 0] = Scale(LSTMoutput1.expsWcf[0, 0], LSTMoutput1.dc[1024, 1])
-LSTMoutput1.dc[1024, 1] = PastValue(LSTMoutput1.ct[0, 0])
-LSTMoutput1.unnamed166[0, 0] = Plus(LSTMoutput1.unnamed167[0, 0], LSTMoutput1.Whfdh[0, 0])
-LSTMoutput1.Whfdh[0, 0] = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[0, 0])
-LSTMoutput1.unnamed163[0, 0] = Scale(LSTMoutput1.expsWhf[0, 0], LSTMoutput1.dh[256, 1])
-LSTMoutput1.unnamed172[0, 0] = Plus(LSTMoutput1.unnamed173[0, 0], LSTMoutput1.Whodh[0, 0])
-LSTMoutput1.Whodh[0, 0] = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[0, 0])
-LSTMoutput1.unnamed169[0, 0] = Scale(LSTMoutput1.expsWho[0, 0], LSTMoutput1.dh[256, 1])
-LSTMoutput1.dh[256, 1] = PastValue(LSTMoutput1.output[0, 0])
-LSTMoutput1.bc[1024, 1] = LearnableParameter
-LSTMoutput1.expsWhc[0, 0] = Exp(LSTMoutput1.sWhc[1, 1])
-LSTMoutput1.sWhc[1, 1] = LearnableParameter
-LSTMoutput1.Whc[1024, 256] = LearnableParameter
-LSTMoutput1.Wxcx[0, 0] = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[0, 0])
-LSTMoutput1.unnamed157[0, 0] = Scale(LSTMoutput1.expsWxc[0, 0], featNorm.xNorm[0, 0])
-LSTMoutput1.expsWxc[0, 0] = Exp(LSTMoutput1.sWxc[1, 1])
-LSTMoutput1.sWxc[1, 1] = LearnableParameter
-LSTMoutput1.Wxc[1024, 33] = LearnableParameter
-LSTMoutput1.expsWci[0, 0] = Exp(LSTMoutput1.sWci[1, 1])
-LSTMoutput1.sWci[1, 1] = LearnableParameter
-LSTMoutput1.Wci[1024, 1] = LearnableParameter
-LSTMoutput1.expsWhi[0, 0] = Exp(LSTMoutput1.sWhi[1, 1])
-LSTMoutput1.sWhi[1, 1] = LearnableParameter
-LSTMoutput1.Whi[1024, 256] = LearnableParameter
-LSTMoutput1.unnamed156[0, 0] = Plus(LSTMoutput1.Wxix[0, 0], LSTMoutput1.bi[1024, 1])
-LSTMoutput1.bi[1024, 1] = LearnableParameter
-LSTMoutput1.Wxix[0, 0] = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[0, 0])
-LSTMoutput1.unnamed151[0, 0] = Scale(LSTMoutput1.expsWxi[0, 0], featNorm.xNorm[0, 0])
-LSTMoutput1.expsWxi[0, 0] = Exp(LSTMoutput1.sWxi[1, 1])
-LSTMoutput1.sWxi[1, 1] = LearnableParameter
-LSTMoutput1.Wxi[1024, 33] = LearnableParameter
-LSTMoutput1.expsWcf[0, 0] = Exp(LSTMoutput1.sWcf[1, 1])
-LSTMoutput1.sWcf[1, 1] = LearnableParameter
-LSTMoutput1.Wcf[1024, 1] = LearnableParameter
-LSTMoutput1.expsWhf[0, 0] = Exp(LSTMoutput1.sWhf[1, 1])
-LSTMoutput1.sWhf[1, 1] = LearnableParameter
-LSTMoutput1.Whf[1024, 256] = LearnableParameter
-LSTMoutput1.unnamed167[0, 0] = Plus(LSTMoutput1.Wxfx[0, 0], LSTMoutput1.bf[1024, 1])
-LSTMoutput1.bf[1024, 1] = LearnableParameter
-LSTMoutput1.Wxfx[0, 0] = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[0, 0])
-LSTMoutput1.unnamed162[0, 0] = Scale(LSTMoutput1.expsWxf[0, 0], featNorm.xNorm[0, 0])
-LSTMoutput1.expsWxf[0, 0] = Exp(LSTMoutput1.sWxf[1, 1])
-LSTMoutput1.sWxf[1, 1] = LearnableParameter
-LSTMoutput1.Wxf[1024, 33] = LearnableParameter
-LSTMoutput1.expsWco[0, 0] = Exp(LSTMoutput1.sWco[1, 1])
-LSTMoutput1.sWco[1, 1] = LearnableParameter
-LSTMoutput1.Wco[1024, 1] = LearnableParameter
-LSTMoutput1.expsWho[0, 0] = Exp(LSTMoutput1.sWho[1, 1])
-LSTMoutput1.sWho[1, 1] = LearnableParameter
-LSTMoutput1.Who[1024, 256] = LearnableParameter
-LSTMoutput1.unnamed173[0, 0] = Plus(LSTMoutput1.Wxox[0, 0], LSTMoutput1.bo[1024, 1])
-LSTMoutput1.bo[1024, 1] = LearnableParameter
-LSTMoutput1.Wxox[0, 0] = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[0, 0])
-LSTMoutput1.unnamed168[0, 0] = Scale(LSTMoutput1.expsWxo[0, 0], featNorm.xNorm[0, 0])
-featNorm.xNorm[0, 0] = PerDimMeanVarNormalization(feashift[0, 0], featNorm.xMean[0, 0], featNorm.xStdDev[0, 0])
-featNorm.xStdDev[0, 0] = InvStdDev(feashift[0, 0])
-featNorm.xMean[0, 0] = Mean(feashift[0, 0])
-feashift[0, 0] = RowSlice(features[363, 1])
-features[363, 1] = InputValue
-LSTMoutput1.expsWxo[0, 0] = Exp(LSTMoutput1.sWxo[1, 1])
-LSTMoutput1.sWxo[1, 1] = LearnableParameter
-LSTMoutput1.Wxo[1024, 33] = LearnableParameter
-LSTMoutput1.expsWmr[0, 0] = Exp(LSTMoutput1.sWmr[1, 1])
-LSTMoutput1.sWmr[1, 1] = LearnableParameter
-LSTMoutput1.Wmr[256, 1024] = LearnableParameter
-LSTMoutput2.expsWxo[0, 0] = Exp(LSTMoutput2.sWxo[1, 1])
-LSTMoutput2.sWxo[1, 1] = LearnableParameter
-LSTMoutput2.Wxo[1024, 256] = LearnableParameter
-LSTMoutput2.expsWmr[0, 0] = Exp(LSTMoutput2.sWmr[1, 1])
-LSTMoutput2.sWmr[1, 1] = LearnableParameter
-LSTMoutput2.Wmr[256, 1024] = LearnableParameter
-LSTMoutput3.expsWxo[0, 0] = Exp(LSTMoutput3.sWxo[1, 1])
-LSTMoutput3.sWxo[1, 1] = LearnableParameter
-LSTMoutput3.Wxo[1024, 256] = LearnableParameter
-LSTMoutput3.expsWmr[0, 0] = Exp(LSTMoutput3.sWmr[1, 1])
-LSTMoutput3.sWmr[1, 1] = LearnableParameter
-LSTMoutput3.Wmr[256, 1024] = LearnableParameter
-expsW[0, 0] = Exp(sW[1, 1])
-sW[1, 1] = LearnableParameter
-W[132, 256] = LearnableParameter
-labels[132, 1] = InputValue
-
-Validating node cr 
-
-Validating --> labels = InputValue
-Validating --> W = LearnableParameter
-Validating --> sW = LearnableParameter
-Validating --> expsW = Exp(sW[1, 1])
-Validating --> LSTMoutput3.Wmr = LearnableParameter
-Validating --> LSTMoutput3.sWmr = LearnableParameter
-Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1])
-Validating --> LSTMoutput3.Wxo = LearnableParameter
-Validating --> LSTMoutput3.sWxo = LearnableParameter
-Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1])
-Validating --> LSTMoutput2.Wmr = LearnableParameter
-Validating --> LSTMoutput2.sWmr = LearnableParameter
-Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1])
-Validating --> LSTMoutput2.Wxo = LearnableParameter
-Validating --> LSTMoutput2.sWxo = LearnableParameter
-Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1])
-Validating --> LSTMoutput1.Wmr = LearnableParameter
-Validating --> LSTMoutput1.sWmr = LearnableParameter
-Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1])
-Validating --> LSTMoutput1.Wxo = LearnableParameter
-Validating --> LSTMoutput1.sWxo = LearnableParameter
-Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1])
-Validating --> features = InputValue
-Validating --> feashift = RowSlice(features[363, 1])
-Validating --> featNorm.xMean = Mean(feashift[33, 1])
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 1])
-Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1])
-Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, 1])
-Validating --> LSTMoutput1.bo = LearnableParameter
-Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, 1], LSTMoutput1.bo[1024, 1])
-Validating --> LSTMoutput1.Who = LearnableParameter
-Validating --> LSTMoutput1.sWho = LearnableParameter
-Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1])
-Validating --> LSTMoutput1.Wco = LearnableParameter
-Validating --> LSTMoutput1.sWco = LearnableParameter
-Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1])
-Validating --> LSTMoutput1.Wxf = LearnableParameter
-Validating --> LSTMoutput1.sWxf = LearnableParameter
-Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1])
-Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, 1])
-Validating --> LSTMoutput1.bf = LearnableParameter
-Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, 1], LSTMoutput1.bf[1024, 1])
-Validating --> LSTMoutput1.Whf = LearnableParameter
-Validating --> LSTMoutput1.sWhf = LearnableParameter
-Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1])
-Validating --> LSTMoutput1.Wcf = LearnableParameter
-Validating --> LSTMoutput1.sWcf = LearnableParameter
-Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1])
-Validating --> LSTMoutput1.Wxi = LearnableParameter
-Validating --> LSTMoutput1.sWxi = LearnableParameter
-Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1])
-Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, 1])
-Validating --> LSTMoutput1.bi = LearnableParameter
-Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, 1], LSTMoutput1.bi[1024, 1])
-Validating --> LSTMoutput1.Whi = LearnableParameter
-Validating --> LSTMoutput1.sWhi = LearnableParameter
-Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1])
-Validating --> LSTMoutput1.Wci = LearnableParameter
-Validating --> LSTMoutput1.sWci = LearnableParameter
-Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1])
-Validating --> LSTMoutput1.Wxc = LearnableParameter
-Validating --> LSTMoutput1.sWxc = LearnableParameter
-Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1])
-Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, 1])
-Validating --> LSTMoutput1.Whc = LearnableParameter
-Validating --> LSTMoutput1.sWhc = LearnableParameter
-Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1])
-Validating --> LSTMoutput1.bc = LearnableParameter
-Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[0 {W=3452816845, H=3452816845, C=3452816845}, 0])
-Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, 1], LSTMoutput1.Whodh[1024, 1])
-Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, 1], LSTMoutput1.Whfdh[1024, 1])
-Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[0 {W=3452816845, H=3452816845, C=3452816845}, 0])
-Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, 1], LSTMoutput1.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, 1], LSTMoutput1.Whidh[1024, 1])
-Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, 1], LSTMoutput1.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 1], LSTMoutput1.bc[1024, 1])
-Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, 1], LSTMoutput1.unnamed161[1024, 1])
-Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, 1])
-Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.unnamed159[1024, 1])
-Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, 1], LSTMoutput1.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.unnamed174[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, 1])
-Validating --> LSTMoutput2.bo = LearnableParameter
-Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, 1], LSTMoutput2.bo[1024, 1])
-Validating --> LSTMoutput2.Who = LearnableParameter
-Validating --> LSTMoutput2.sWho = LearnableParameter
-Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1])
-Validating --> LSTMoutput2.Wco = LearnableParameter
-Validating --> LSTMoutput2.sWco = LearnableParameter
-Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1])
-Validating --> LSTMoutput2.Wxf = LearnableParameter
-Validating --> LSTMoutput2.sWxf = LearnableParameter
-Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1])
-Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, 1])
-Validating --> LSTMoutput2.bf = LearnableParameter
-Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, 1], LSTMoutput2.bf[1024, 1])
-Validating --> LSTMoutput2.Whf = LearnableParameter
-Validating --> LSTMoutput2.sWhf = LearnableParameter
-Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1])
-Validating --> LSTMoutput2.Wcf = LearnableParameter
-Validating --> LSTMoutput2.sWcf = LearnableParameter
-Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1])
-Validating --> LSTMoutput2.Wxi = LearnableParameter
-Validating --> LSTMoutput2.sWxi = LearnableParameter
-Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1])
-Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, 1])
-Validating --> LSTMoutput2.bi = LearnableParameter
-Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, 1], LSTMoutput2.bi[1024, 1])
-Validating --> LSTMoutput2.Whi = LearnableParameter
-Validating --> LSTMoutput2.sWhi = LearnableParameter
-Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1])
-Validating --> LSTMoutput2.Wci = LearnableParameter
-Validating --> LSTMoutput2.sWci = LearnableParameter
-Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1])
-Validating --> LSTMoutput2.Wxc = LearnableParameter
-Validating --> LSTMoutput2.sWxc = LearnableParameter
-Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1])
-Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, 1])
-Validating --> LSTMoutput2.Whc = LearnableParameter
-Validating --> LSTMoutput2.sWhc = LearnableParameter
-Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1])
-Validating --> LSTMoutput2.bc = LearnableParameter
-Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[0 {W=3452816845, H=3452816845, C=3452816845}, 0])
-Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, 1], LSTMoutput2.Whodh[1024, 1])
-Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, 1], LSTMoutput2.Whfdh[1024, 1])
-Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[0 {W=3452816845, H=3452816845, C=3452816845}, 0])
-Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, 1], LSTMoutput2.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, 1], LSTMoutput2.Whidh[1024, 1])
-Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, 1], LSTMoutput2.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, 1], LSTMoutput2.bc[1024, 1])
-Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, 1], LSTMoutput2.unnamed211[1024, 1])
-Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, 1])
-Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.unnamed209[1024, 1])
-Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, 1], LSTMoutput2.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.unnamed224[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, 1])
-Validating --> LSTMoutput3.bo = LearnableParameter
-Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, 1], LSTMoutput3.bo[1024, 1])
-Validating --> LSTMoutput3.Who = LearnableParameter
-Validating --> LSTMoutput3.sWho = LearnableParameter
-Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1])
-Validating --> LSTMoutput3.Wco = LearnableParameter
-Validating --> LSTMoutput3.sWco = LearnableParameter
-Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1])
-Validating --> LSTMoutput3.Wxf = LearnableParameter
-Validating --> LSTMoutput3.sWxf = LearnableParameter
-Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1])
-Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, 1])
-Validating --> LSTMoutput3.bf = LearnableParameter
-Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, 1], LSTMoutput3.bf[1024, 1])
-Validating --> LSTMoutput3.Whf = LearnableParameter
-Validating --> LSTMoutput3.sWhf = LearnableParameter
-Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1])
-Validating --> LSTMoutput3.Wcf = LearnableParameter
-Validating --> LSTMoutput3.sWcf = LearnableParameter
-Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1])
-Validating --> LSTMoutput3.Wxi = LearnableParameter
-Validating --> LSTMoutput3.sWxi = LearnableParameter
-Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1])
-Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, 1])
-Validating --> LSTMoutput3.bi = LearnableParameter
-Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, 1], LSTMoutput3.bi[1024, 1])
-Validating --> LSTMoutput3.Whi = LearnableParameter
-Validating --> LSTMoutput3.sWhi = LearnableParameter
-Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1])
-Validating --> LSTMoutput3.Wci = LearnableParameter
-Validating --> LSTMoutput3.sWci = LearnableParameter
-Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1])
-Validating --> LSTMoutput3.Wxc = LearnableParameter
-Validating --> LSTMoutput3.sWxc = LearnableParameter
-Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1])
-Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, 1])
-Validating --> LSTMoutput3.Whc = LearnableParameter
-Validating --> LSTMoutput3.sWhc = LearnableParameter
-Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1])
-Validating --> LSTMoutput3.bc = LearnableParameter
-Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[0 {W=3452816845, H=3452816845, C=3452816845}, 0])
-Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, 1], LSTMoutput3.Whodh[1024, 1])
-Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, 1], LSTMoutput3.Whfdh[1024, 1])
-Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[0 {W=3452816845, H=3452816845, C=3452816845}, 0])
-Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, 1], LSTMoutput3.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, 1], LSTMoutput3.Whidh[1024, 1])
-Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, 1], LSTMoutput3.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 1], LSTMoutput3.bc[1024, 1])
-Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, 1], LSTMoutput3.unnamed261[1024, 1])
-Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, 1])
-Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.unnamed259[1024, 1])
-Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, 1], LSTMoutput3.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.unnamed274[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, 1])
-Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, 1])
-Validating --> b = LearnableParameter
-Validating --> LSTMoutputW = Plus(unnamed283[132, 1], b[132, 1])
-Validating --> cr = CrossEntropyWithSoftmax(labels[132, 1], LSTMoutputW[132, 1])
-
- nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
-
-Validating node ScaledLogLikelihood 
-
-Validating --> W = LearnableParameter
-Validating --> sW = LearnableParameter
-Validating --> expsW = Exp(sW[1, 1])
-Validating --> LSTMoutput3.Wmr = LearnableParameter
-Validating --> LSTMoutput3.sWmr = LearnableParameter
-Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1])
-Validating --> LSTMoutput3.Wxo = LearnableParameter
-Validating --> LSTMoutput3.sWxo = LearnableParameter
-Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1])
-Validating --> LSTMoutput2.Wmr = LearnableParameter
-Validating --> LSTMoutput2.sWmr = LearnableParameter
-Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1])
-Validating --> LSTMoutput2.Wxo = LearnableParameter
-Validating --> LSTMoutput2.sWxo = LearnableParameter
-Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1])
-Validating --> LSTMoutput1.Wmr = LearnableParameter
-Validating --> LSTMoutput1.sWmr = LearnableParameter
-Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1])
-Validating --> LSTMoutput1.Wxo = LearnableParameter
-Validating --> LSTMoutput1.sWxo = LearnableParameter
-Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1])
-Validating --> features = InputValue
-Validating --> feashift = RowSlice(features[363, 1])
-Validating --> featNorm.xMean = Mean(feashift[33, 1])
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 1])
-Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1])
-Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, 1])
-Validating --> LSTMoutput1.bo = LearnableParameter
-Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, 1], LSTMoutput1.bo[1024, 1])
-Validating --> LSTMoutput1.Who = LearnableParameter
-Validating --> LSTMoutput1.sWho = LearnableParameter
-Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1])
-Validating --> LSTMoutput1.Wco = LearnableParameter
-Validating --> LSTMoutput1.sWco = LearnableParameter
-Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1])
-Validating --> LSTMoutput1.Wxf = LearnableParameter
-Validating --> LSTMoutput1.sWxf = LearnableParameter
-Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1])
-Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, 1])
-Validating --> LSTMoutput1.bf = LearnableParameter
-Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, 1], LSTMoutput1.bf[1024, 1])
-Validating --> LSTMoutput1.Whf = LearnableParameter
-Validating --> LSTMoutput1.sWhf = LearnableParameter
-Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1])
-Validating --> LSTMoutput1.Wcf = LearnableParameter
-Validating --> LSTMoutput1.sWcf = LearnableParameter
-Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1])
-Validating --> LSTMoutput1.Wxi = LearnableParameter
-Validating --> LSTMoutput1.sWxi = LearnableParameter
-Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1])
-Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, 1])
-Validating --> LSTMoutput1.bi = LearnableParameter
-Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, 1], LSTMoutput1.bi[1024, 1])
-Validating --> LSTMoutput1.Whi = LearnableParameter
-Validating --> LSTMoutput1.sWhi = LearnableParameter
-Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1])
-Validating --> LSTMoutput1.Wci = LearnableParameter
-Validating --> LSTMoutput1.sWci = LearnableParameter
-Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1])
-Validating --> LSTMoutput1.Wxc = LearnableParameter
-Validating --> LSTMoutput1.sWxc = LearnableParameter
-Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1])
-Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, 1])
-Validating --> LSTMoutput1.Whc = LearnableParameter
-Validating --> LSTMoutput1.sWhc = LearnableParameter
-Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1])
-Validating --> LSTMoutput1.bc = LearnableParameter
-Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, 1])
-Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, 1], LSTMoutput1.Whodh[1024, 1])
-Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, 1])
-Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, 1], LSTMoutput1.Whfdh[1024, 1])
-Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, 1], LSTMoutput1.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, 1])
-Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, 1], LSTMoutput1.Whidh[1024, 1])
-Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, 1], LSTMoutput1.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, 1])
-Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 1], LSTMoutput1.bc[1024, 1])
-Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, 1], LSTMoutput1.unnamed161[1024, 1])
-Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, 1])
-Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.unnamed159[1024, 1])
-Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, 1], LSTMoutput1.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.unnamed174[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, 1])
-Validating --> LSTMoutput2.bo = LearnableParameter
-Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, 1], LSTMoutput2.bo[1024, 1])
-Validating --> LSTMoutput2.Who = LearnableParameter
-Validating --> LSTMoutput2.sWho = LearnableParameter
-Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1])
-Validating --> LSTMoutput2.Wco = LearnableParameter
-Validating --> LSTMoutput2.sWco = LearnableParameter
-Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1])
-Validating --> LSTMoutput2.Wxf = LearnableParameter
-Validating --> LSTMoutput2.sWxf = LearnableParameter
-Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1])
-Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, 1])
-Validating --> LSTMoutput2.bf = LearnableParameter
-Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, 1], LSTMoutput2.bf[1024, 1])
-Validating --> LSTMoutput2.Whf = LearnableParameter
-Validating --> LSTMoutput2.sWhf = LearnableParameter
-Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1])
-Validating --> LSTMoutput2.Wcf = LearnableParameter
-Validating --> LSTMoutput2.sWcf = LearnableParameter
-Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1])
-Validating --> LSTMoutput2.Wxi = LearnableParameter
-Validating --> LSTMoutput2.sWxi = LearnableParameter
-Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1])
-Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, 1])
-Validating --> LSTMoutput2.bi = LearnableParameter
-Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, 1], LSTMoutput2.bi[1024, 1])
-Validating --> LSTMoutput2.Whi = LearnableParameter
-Validating --> LSTMoutput2.sWhi = LearnableParameter
-Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1])
-Validating --> LSTMoutput2.Wci = LearnableParameter
-Validating --> LSTMoutput2.sWci = LearnableParameter
-Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1])
-Validating --> LSTMoutput2.Wxc = LearnableParameter
-Validating --> LSTMoutput2.sWxc = LearnableParameter
-Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1])
-Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, 1])
-Validating --> LSTMoutput2.Whc = LearnableParameter
-Validating --> LSTMoutput2.sWhc = LearnableParameter
-Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1])
-Validating --> LSTMoutput2.bc = LearnableParameter
-Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, 1])
-Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, 1], LSTMoutput2.Whodh[1024, 1])
-Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, 1])
-Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, 1], LSTMoutput2.Whfdh[1024, 1])
-Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, 1], LSTMoutput2.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, 1])
-Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, 1], LSTMoutput2.Whidh[1024, 1])
-Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, 1], LSTMoutput2.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, 1])
-Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, 1], LSTMoutput2.bc[1024, 1])
-Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, 1], LSTMoutput2.unnamed211[1024, 1])
-Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, 1])
-Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.unnamed209[1024, 1])
-Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, 1], LSTMoutput2.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.unnamed224[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, 1])
-Validating --> LSTMoutput3.bo = LearnableParameter
-Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, 1], LSTMoutput3.bo[1024, 1])
-Validating --> LSTMoutput3.Who = LearnableParameter
-Validating --> LSTMoutput3.sWho = LearnableParameter
-Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1])
-Validating --> LSTMoutput3.Wco = LearnableParameter
-Validating --> LSTMoutput3.sWco = LearnableParameter
-Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1])
-Validating --> LSTMoutput3.Wxf = LearnableParameter
-Validating --> LSTMoutput3.sWxf = LearnableParameter
-Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1])
-Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, 1])
-Validating --> LSTMoutput3.bf = LearnableParameter
-Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, 1], LSTMoutput3.bf[1024, 1])
-Validating --> LSTMoutput3.Whf = LearnableParameter
-Validating --> LSTMoutput3.sWhf = LearnableParameter
-Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1])
-Validating --> LSTMoutput3.Wcf = LearnableParameter
-Validating --> LSTMoutput3.sWcf = LearnableParameter
-Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1])
-Validating --> LSTMoutput3.Wxi = LearnableParameter
-Validating --> LSTMoutput3.sWxi = LearnableParameter
-Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1])
-Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, 1])
-Validating --> LSTMoutput3.bi = LearnableParameter
-Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, 1], LSTMoutput3.bi[1024, 1])
-Validating --> LSTMoutput3.Whi = LearnableParameter
-Validating --> LSTMoutput3.sWhi = LearnableParameter
-Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1])
-Validating --> LSTMoutput3.Wci = LearnableParameter
-Validating --> LSTMoutput3.sWci = LearnableParameter
-Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1])
-Validating --> LSTMoutput3.Wxc = LearnableParameter
-Validating --> LSTMoutput3.sWxc = LearnableParameter
-Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1])
-Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, 1])
-Validating --> LSTMoutput3.Whc = LearnableParameter
-Validating --> LSTMoutput3.sWhc = LearnableParameter
-Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1])
-Validating --> LSTMoutput3.bc = LearnableParameter
-Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, 1])
-Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, 1])
-Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, 1], LSTMoutput3.Whodh[1024, 1])
-Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, 1])
-Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, 1], LSTMoutput3.Whfdh[1024, 1])
-Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, 1], LSTMoutput3.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, 1])
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, 1], LSTMoutput3.Whidh[1024, 1])
-Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, 1], LSTMoutput3.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, 1])
-Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 1], LSTMoutput3.bc[1024, 1])
-Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, 1], LSTMoutput3.unnamed261[1024, 1])
-Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, 1])
-Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.unnamed259[1024, 1])
-Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, 1], LSTMoutput3.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.unnamed274[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, 1])
-Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, 1])
-Validating --> b = LearnableParameter
-Validating --> LSTMoutputW = Plus(unnamed283[132, 1], b[132, 1])
-Validating --> labels = InputValue
-Validating --> logPrior.Prior = Mean(labels[132, 1])
-Validating --> logPrior.LogPrior = Log(logPrior.Prior[132, 1])
-Validating --> ScaledLogLikelihood = Minus(LSTMoutputW[132, 1], logPrior.LogPrior[132, 1])
-
- nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
-
-Validating node Err 
-
-Validating --> labels = InputValue
-Validating --> W = LearnableParameter
-Validating --> sW = LearnableParameter
-Validating --> expsW = Exp(sW[1, 1])
-Validating --> LSTMoutput3.Wmr = LearnableParameter
-Validating --> LSTMoutput3.sWmr = LearnableParameter
-Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1])
-Validating --> LSTMoutput3.Wxo = LearnableParameter
-Validating --> LSTMoutput3.sWxo = LearnableParameter
-Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1])
-Validating --> LSTMoutput2.Wmr = LearnableParameter
-Validating --> LSTMoutput2.sWmr = LearnableParameter
-Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1])
-Validating --> LSTMoutput2.Wxo = LearnableParameter
-Validating --> LSTMoutput2.sWxo = LearnableParameter
-Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1])
-Validating --> LSTMoutput1.Wmr = LearnableParameter
-Validating --> LSTMoutput1.sWmr = LearnableParameter
-Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1])
-Validating --> LSTMoutput1.Wxo = LearnableParameter
-Validating --> LSTMoutput1.sWxo = LearnableParameter
-Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1])
-Validating --> features = InputValue
-Validating --> feashift = RowSlice(features[363, 1])
-Validating --> featNorm.xMean = Mean(feashift[33, 1])
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 1])
-Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1])
-Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, 1])
-Validating --> LSTMoutput1.bo = LearnableParameter
-Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, 1], LSTMoutput1.bo[1024, 1])
-Validating --> LSTMoutput1.Who = LearnableParameter
-Validating --> LSTMoutput1.sWho = LearnableParameter
-Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1])
-Validating --> LSTMoutput1.Wco = LearnableParameter
-Validating --> LSTMoutput1.sWco = LearnableParameter
-Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1])
-Validating --> LSTMoutput1.Wxf = LearnableParameter
-Validating --> LSTMoutput1.sWxf = LearnableParameter
-Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1])
-Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, 1])
-Validating --> LSTMoutput1.bf = LearnableParameter
-Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, 1], LSTMoutput1.bf[1024, 1])
-Validating --> LSTMoutput1.Whf = LearnableParameter
-Validating --> LSTMoutput1.sWhf = LearnableParameter
-Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1])
-Validating --> LSTMoutput1.Wcf = LearnableParameter
-Validating --> LSTMoutput1.sWcf = LearnableParameter
-Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1])
-Validating --> LSTMoutput1.Wxi = LearnableParameter
-Validating --> LSTMoutput1.sWxi = LearnableParameter
-Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1])
-Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, 1])
-Validating --> LSTMoutput1.bi = LearnableParameter
-Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, 1], LSTMoutput1.bi[1024, 1])
-Validating --> LSTMoutput1.Whi = LearnableParameter
-Validating --> LSTMoutput1.sWhi = LearnableParameter
-Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1])
-Validating --> LSTMoutput1.Wci = LearnableParameter
-Validating --> LSTMoutput1.sWci = LearnableParameter
-Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1])
-Validating --> LSTMoutput1.Wxc = LearnableParameter
-Validating --> LSTMoutput1.sWxc = LearnableParameter
-Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1])
-Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, 1])
-Validating --> LSTMoutput1.Whc = LearnableParameter
-Validating --> LSTMoutput1.sWhc = LearnableParameter
-Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1])
-Validating --> LSTMoutput1.bc = LearnableParameter
-Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, 1])
-Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, 1], LSTMoutput1.Whodh[1024, 1])
-Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, 1])
-Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, 1], LSTMoutput1.Whfdh[1024, 1])
-Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, 1], LSTMoutput1.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, 1])
-Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, 1], LSTMoutput1.Whidh[1024, 1])
-Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, 1], LSTMoutput1.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, 1])
-Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 1], LSTMoutput1.bc[1024, 1])
-Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, 1], LSTMoutput1.unnamed161[1024, 1])
-Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, 1])
-Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.unnamed159[1024, 1])
-Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, 1], LSTMoutput1.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.unnamed174[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, 1])
-Validating --> LSTMoutput2.bo = LearnableParameter
-Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, 1], LSTMoutput2.bo[1024, 1])
-Validating --> LSTMoutput2.Who = LearnableParameter
-Validating --> LSTMoutput2.sWho = LearnableParameter
-Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1])
-Validating --> LSTMoutput2.Wco = LearnableParameter
-Validating --> LSTMoutput2.sWco = LearnableParameter
-Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1])
-Validating --> LSTMoutput2.Wxf = LearnableParameter
-Validating --> LSTMoutput2.sWxf = LearnableParameter
-Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1])
-Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, 1])
-Validating --> LSTMoutput2.bf = LearnableParameter
-Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, 1], LSTMoutput2.bf[1024, 1])
-Validating --> LSTMoutput2.Whf = LearnableParameter
-Validating --> LSTMoutput2.sWhf = LearnableParameter
-Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1])
-Validating --> LSTMoutput2.Wcf = LearnableParameter
-Validating --> LSTMoutput2.sWcf = LearnableParameter
-Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1])
-Validating --> LSTMoutput2.Wxi = LearnableParameter
-Validating --> LSTMoutput2.sWxi = LearnableParameter
-Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1])
-Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, 1])
-Validating --> LSTMoutput2.bi = LearnableParameter
-Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, 1], LSTMoutput2.bi[1024, 1])
-Validating --> LSTMoutput2.Whi = LearnableParameter
-Validating --> LSTMoutput2.sWhi = LearnableParameter
-Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1])
-Validating --> LSTMoutput2.Wci = LearnableParameter
-Validating --> LSTMoutput2.sWci = LearnableParameter
-Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1])
-Validating --> LSTMoutput2.Wxc = LearnableParameter
-Validating --> LSTMoutput2.sWxc = LearnableParameter
-Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1])
-Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, 1])
-Validating --> LSTMoutput2.Whc = LearnableParameter
-Validating --> LSTMoutput2.sWhc = LearnableParameter
-Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1])
-Validating --> LSTMoutput2.bc = LearnableParameter
-Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, 1])
-Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, 1], LSTMoutput2.Whodh[1024, 1])
-Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, 1])
-Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, 1], LSTMoutput2.Whfdh[1024, 1])
-Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, 1], LSTMoutput2.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, 1])
-Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, 1], LSTMoutput2.Whidh[1024, 1])
-Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, 1], LSTMoutput2.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, 1])
-Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, 1], LSTMoutput2.bc[1024, 1])
-Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, 1], LSTMoutput2.unnamed211[1024, 1])
-Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, 1])
-Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.unnamed209[1024, 1])
-Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, 1], LSTMoutput2.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.unnamed224[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, 1])
-Validating --> LSTMoutput3.bo = LearnableParameter
-Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, 1], LSTMoutput3.bo[1024, 1])
-Validating --> LSTMoutput3.Who = LearnableParameter
-Validating --> LSTMoutput3.sWho = LearnableParameter
-Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1])
-Validating --> LSTMoutput3.Wco = LearnableParameter
-Validating --> LSTMoutput3.sWco = LearnableParameter
-Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1])
-Validating --> LSTMoutput3.Wxf = LearnableParameter
-Validating --> LSTMoutput3.sWxf = LearnableParameter
-Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1])
-Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, 1])
-Validating --> LSTMoutput3.bf = LearnableParameter
-Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, 1], LSTMoutput3.bf[1024, 1])
-Validating --> LSTMoutput3.Whf = LearnableParameter
-Validating --> LSTMoutput3.sWhf = LearnableParameter
-Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1])
-Validating --> LSTMoutput3.Wcf = LearnableParameter
-Validating --> LSTMoutput3.sWcf = LearnableParameter
-Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1])
-Validating --> LSTMoutput3.Wxi = LearnableParameter
-Validating --> LSTMoutput3.sWxi = LearnableParameter
-Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1])
-Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, 1])
-Validating --> LSTMoutput3.bi = LearnableParameter
-Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, 1], LSTMoutput3.bi[1024, 1])
-Validating --> LSTMoutput3.Whi = LearnableParameter
-Validating --> LSTMoutput3.sWhi = LearnableParameter
-Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1])
-Validating --> LSTMoutput3.Wci = LearnableParameter
-Validating --> LSTMoutput3.sWci = LearnableParameter
-Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1])
-Validating --> LSTMoutput3.Wxc = LearnableParameter
-Validating --> LSTMoutput3.sWxc = LearnableParameter
-Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1])
-Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, 1])
-Validating --> LSTMoutput3.Whc = LearnableParameter
-Validating --> LSTMoutput3.sWhc = LearnableParameter
-Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1])
-Validating --> LSTMoutput3.bc = LearnableParameter
-Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, 1])
-Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, 1])
-Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, 1], LSTMoutput3.Whodh[1024, 1])
-Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, 1])
-Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, 1], LSTMoutput3.Whfdh[1024, 1])
-Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, 1], LSTMoutput3.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, 1])
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, 1], LSTMoutput3.Whidh[1024, 1])
-Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, 1], LSTMoutput3.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, 1])
-Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 1], LSTMoutput3.bc[1024, 1])
-Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, 1], LSTMoutput3.unnamed261[1024, 1])
-Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, 1])
-Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.unnamed259[1024, 1])
-Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, 1], LSTMoutput3.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.unnamed274[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, 1])
-Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, 1])
-Validating --> b = LearnableParameter
-Validating --> LSTMoutputW = Plus(unnamed283[132, 1], b[132, 1])
-Validating --> Err = ErrorPrediction(labels[132, 1], LSTMoutputW[132, 1])
-
+Node --> B = LearnableParameter
+Node --> labels = InputValue
+Node --> LSTMoutputW./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].Wmr = LearnableParameter
+Node --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].Wmr = LearnableParameter
+Node --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].Wmr = LearnableParameter
+Node --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> features = InputValue
+Node --> feashift = RowSlice
+Node --> featNorm.meanVector = Mean
+Node --> featNorm.invStdDevVector = InvStdDev
+Node --> featNorm = PerDimMeanVarNormalization
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].dh = PastValue
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[1].ot.z./*+*/left = Plus
+Node --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[1].ft.z./*+*/left = Plus
+Node --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].dc = PastValue
+Node --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[1].ft.z./*+*/right = DiagTimes
+Node --> LSTMoutput[1].ft.z = Plus
+Node --> LSTMoutput[1].ft = Sigmoid
+Node --> LSTMoutput[1].bft = ElementTimes
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[1].it.z./*+*/left = Plus
+Node --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].it.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[1].it.z./*+*/right = DiagTimes
+Node --> LSTMoutput[1].it.z = Plus
+Node --> LSTMoutput[1].it = Sigmoid
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter
+Node --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus
+Node --> LSTMoutput[1].bit./*.**/right.z = Plus
+Node --> LSTMoutput[1].bit./*.**/right = Tanh
+Node --> LSTMoutput[1].bit = ElementTimes
+Node --> LSTMoutput[1].ct = Plus
+Node --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[1].ot.z./*+*/right = DiagTimes
+Node --> LSTMoutput[1].ot.z = Plus
+Node --> LSTMoutput[1].ot = Sigmoid
+Node --> LSTMoutput[1].mt./*.**/right = Tanh
+Node --> LSTMoutput[1].mt = ElementTimes
+Node --> LSTMoutput[1].output./***/right = Scale
+Node --> LSTMoutput[1].output = Times
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].dh = PastValue
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[2].ot.z./*+*/left = Plus
+Node --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[2].ft.z./*+*/left = Plus
+Node --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].dc = PastValue
+Node --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[2].ft.z./*+*/right = DiagTimes
+Node --> LSTMoutput[2].ft.z = Plus
+Node --> LSTMoutput[2].ft = Sigmoid
+Node --> LSTMoutput[2].bft = ElementTimes
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[2].it.z./*+*/left = Plus
+Node --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].it.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[2].it.z./*+*/right = DiagTimes
+Node --> LSTMoutput[2].it.z = Plus
+Node --> LSTMoutput[2].it = Sigmoid
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter
+Node --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus
+Node --> LSTMoutput[2].bit./*.**/right.z = Plus
+Node --> LSTMoutput[2].bit./*.**/right = Tanh
+Node --> LSTMoutput[2].bit = ElementTimes
+Node --> LSTMoutput[2].ct = Plus
+Node --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[2].ot.z./*+*/right = DiagTimes
+Node --> LSTMoutput[2].ot.z = Plus
+Node --> LSTMoutput[2].ot = Sigmoid
+Node --> LSTMoutput[2].mt./*.**/right = Tanh
+Node --> LSTMoutput[2].mt = ElementTimes
+Node --> LSTMoutput[2].output./***/right = Scale
+Node --> LSTMoutput[2].output = Times
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].dh = PastValue
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[3].ot.z./*+*/left = Plus
+Node --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[3].ft.z./*+*/left = Plus
+Node --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].dc = PastValue
+Node --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[3].ft.z./*+*/right = DiagTimes
+Node --> LSTMoutput[3].ft.z = Plus
+Node --> LSTMoutput[3].ft = Sigmoid
+Node --> LSTMoutput[3].bft = ElementTimes
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale
+Node --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times
+Node --> LSTMoutput[3].it.z./*+*/left = Plus
+Node --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].it.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[3].it.z./*+*/right = DiagTimes
+Node --> LSTMoutput[3].it.z = Plus
+Node --> LSTMoutput[3].it = Sigmoid
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter
+Node --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus
+Node --> LSTMoutput[3].bit./*.**/right.z = Plus
+Node --> LSTMoutput[3].bit./*.**/right = Tanh
+Node --> LSTMoutput[3].bit = ElementTimes
+Node --> LSTMoutput[3].ct = Plus
+Node --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale
+Node --> LSTMoutput[3].ot.z./*+*/right = DiagTimes
+Node --> LSTMoutput[3].ot.z = Plus
+Node --> LSTMoutput[3].ot = Sigmoid
+Node --> LSTMoutput[3].mt./*.**/right = Tanh
+Node --> LSTMoutput[3].mt = ElementTimes
+Node --> LSTMoutput[3].output./***/right = Scale
+Node --> LSTMoutput[3].output = Times
+Node --> LSTMoutputW./*+*/left./***/right = Scale
+Node --> LSTMoutputW./*+*/left = Times
+Node --> LSTMoutputW = Plus
+Node --> Err = ErrorPrediction
+Node --> logPrior.x = Mean
+Node --> logPrior = Log
+Node --> ScaledLogLikelihood = Minus
+Node --> cr = CrossEntropyWithSoftmax
+class Microsoft::MSR::CNTK::ComputationNetwork [
+  B : LearnableParameter 132 x 1 ()
+  cr : CrossEntropyWithSoftmax 0 x 0 (
+    labels
+    LSTMoutputW
+  )
+  Err : ErrorPrediction 0 x 0 (
+    labels
+    LSTMoutputW
+  )
+  feashift : RowSlice 0 x 0 (
+    features
+  )
+  featNorm : PerDimMeanVarNormalization 0 x 0 (
+    feashift
+    featNorm.meanVector
+    featNorm.invStdDevVector
+  )
+  featNorm.invStdDevVector : InvStdDev 0 x 0 (
+    feashift
+  )
+  featNorm.meanVector : Mean 0 x 0 (
+    feashift
+  )
+  features : InputValue 363 x 1 ()
+  labels : InputValue 132 x 1 ()
+  logPrior : Log 0 x 0 (
+    logPrior.x
+  )
+  logPrior.x : Mean 0 x 0 (
+    labels
+  )
+  LSTMoutput[1].bft : ElementTimes 0 x 0 (
+    LSTMoutput[1].ft
+    LSTMoutput[1].dc
+  )
+  LSTMoutput[1].bit : ElementTimes 0 x 0 (
+    LSTMoutput[1].it
+    LSTMoutput[1].bit./*.**/right
+  )
+  LSTMoutput[1].bit./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z
+  )
+  LSTMoutput[1].bit./*.**/right.z : Plus 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/left
+    LSTMoutput[1].bit./*.**/right.z./*+*/right
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/left : Times 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left
+    LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 33 ()
+  LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor
+    featNorm
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].bit./*.**/right.z./*+*/right : Plus 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[1].dh
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].ct : Plus 0 x 0 (
+    LSTMoutput[1].bft
+    LSTMoutput[1].bit
+  )
+  LSTMoutput[1].dc : PastValue 1024 x 1 (
+    LSTMoutput[1].ct
+  )
+  LSTMoutput[1].dh : PastValue 256 x 1 (
+    LSTMoutput[1].output
+  )
+  LSTMoutput[1].ft : Sigmoid 0 x 0 (
+    LSTMoutput[1].ft.z
+  )
+  LSTMoutput[1].ft.z : Plus 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left
+    LSTMoutput[1].ft.z./*+*/right
+  )
+  LSTMoutput[1].ft.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/left
+    LSTMoutput[1].ft.z./*+*/left./*+*/right
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 ()
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    featNorm
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].ft.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left
+    LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[1].dh
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].ft.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[1].ft.z./*+*/right.matrix
+  )
+  LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].ft.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[1].dc
+  )
+  LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].it : Sigmoid 0 x 0 (
+    LSTMoutput[1].it.z
+  )
+  LSTMoutput[1].it.z : Plus 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left
+    LSTMoutput[1].it.z./*+*/right
+  )
+  LSTMoutput[1].it.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/left
+    LSTMoutput[1].it.z./*+*/left./*+*/right
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 ()
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    featNorm
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].it.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/right./***/left
+    LSTMoutput[1].it.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[1].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[1].dh
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].it.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[1].it.z./*+*/right.matrix
+  )
+  LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].it.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[1].dc
+  )
+  LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].mt : ElementTimes 0 x 0 (
+    LSTMoutput[1].ot
+    LSTMoutput[1].mt./*.**/right
+  )
+  LSTMoutput[1].mt./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[1].ct
+  )
+  LSTMoutput[1].ot : Sigmoid 0 x 0 (
+    LSTMoutput[1].ot.z
+  )
+  LSTMoutput[1].ot.z : Plus 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left
+    LSTMoutput[1].ot.z./*+*/right
+  )
+  LSTMoutput[1].ot.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/left
+    LSTMoutput[1].ot.z./*+*/left./*+*/right
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 33 ()
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    featNorm
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].ot.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left
+    LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[1].dh
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].ot.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[1].ot.z./*+*/right.matrix
+  )
+  LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[1].ot.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[1].ct
+  )
+  LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].output : Times 0 x 0 (
+    LSTMoutput[1].Wmr
+    LSTMoutput[1].output./***/right
+  )
+  LSTMoutput[1].output./***/right : Scale 0 x 0 (
+    LSTMoutput[1].output./***/right.scalarScalingFactor
+    LSTMoutput[1].mt
+  )
+  LSTMoutput[1].output./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[1].output./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[1].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[1].Wmr : LearnableParameter 256 x 1024 ()
+  LSTMoutput[2].bft : ElementTimes 0 x 0 (
+    LSTMoutput[2].ft
+    LSTMoutput[2].dc
+  )
+  LSTMoutput[2].bit : ElementTimes 0 x 0 (
+    LSTMoutput[2].it
+    LSTMoutput[2].bit./*.**/right
+  )
+  LSTMoutput[2].bit./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z
+  )
+  LSTMoutput[2].bit./*.**/right.z : Plus 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/left
+    LSTMoutput[2].bit./*.**/right.z./*+*/right
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/left : Times 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left
+    LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[1].output
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].bit./*.**/right.z./*+*/right : Plus 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[2].dh
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].ct : Plus 0 x 0 (
+    LSTMoutput[2].bft
+    LSTMoutput[2].bit
+  )
+  LSTMoutput[2].dc : PastValue 1024 x 1 (
+    LSTMoutput[2].ct
+  )
+  LSTMoutput[2].dh : PastValue 256 x 1 (
+    LSTMoutput[2].output
+  )
+  LSTMoutput[2].ft : Sigmoid 0 x 0 (
+    LSTMoutput[2].ft.z
+  )
+  LSTMoutput[2].ft.z : Plus 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left
+    LSTMoutput[2].ft.z./*+*/right
+  )
+  LSTMoutput[2].ft.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/left
+    LSTMoutput[2].ft.z./*+*/left./*+*/right
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[1].output
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].ft.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left
+    LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[2].dh
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].ft.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[2].ft.z./*+*/right.matrix
+  )
+  LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].ft.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[2].dc
+  )
+  LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].it : Sigmoid 0 x 0 (
+    LSTMoutput[2].it.z
+  )
+  LSTMoutput[2].it.z : Plus 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left
+    LSTMoutput[2].it.z./*+*/right
+  )
+  LSTMoutput[2].it.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/left
+    LSTMoutput[2].it.z./*+*/left./*+*/right
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[1].output
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].it.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/right./***/left
+    LSTMoutput[2].it.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[2].dh
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].it.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[2].it.z./*+*/right.matrix
+  )
+  LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].it.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[2].dc
+  )
+  LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].mt : ElementTimes 0 x 0 (
+    LSTMoutput[2].ot
+    LSTMoutput[2].mt./*.**/right
+  )
+  LSTMoutput[2].mt./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[2].ct
+  )
+  LSTMoutput[2].ot : Sigmoid 0 x 0 (
+    LSTMoutput[2].ot.z
+  )
+  LSTMoutput[2].ot.z : Plus 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left
+    LSTMoutput[2].ot.z./*+*/right
+  )
+  LSTMoutput[2].ot.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/left
+    LSTMoutput[2].ot.z./*+*/left./*+*/right
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[1].output
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].ot.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left
+    LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[2].dh
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].ot.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[2].ot.z./*+*/right.matrix
+  )
+  LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[2].ot.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[2].ct
+  )
+  LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].output : Times 0 x 0 (
+    LSTMoutput[2].Wmr
+    LSTMoutput[2].output./***/right
+  )
+  LSTMoutput[2].output./***/right : Scale 0 x 0 (
+    LSTMoutput[2].output./***/right.scalarScalingFactor
+    LSTMoutput[2].mt
+  )
+  LSTMoutput[2].output./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[2].output./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[2].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[2].Wmr : LearnableParameter 256 x 1024 ()
+  LSTMoutput[3].bft : ElementTimes 0 x 0 (
+    LSTMoutput[3].ft
+    LSTMoutput[3].dc
+  )
+  LSTMoutput[3].bit : ElementTimes 0 x 0 (
+    LSTMoutput[3].it
+    LSTMoutput[3].bit./*.**/right
+  )
+  LSTMoutput[3].bit./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z
+  )
+  LSTMoutput[3].bit./*.**/right.z : Plus 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/left
+    LSTMoutput[3].bit./*.**/right.z./*+*/right
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/left : Times 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left
+    LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[2].output
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].bit./*.**/right.z./*+*/right : Plus 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left : Times 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[3].dh
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].ct : Plus 0 x 0 (
+    LSTMoutput[3].bft
+    LSTMoutput[3].bit
+  )
+  LSTMoutput[3].dc : PastValue 1024 x 1 (
+    LSTMoutput[3].ct
+  )
+  LSTMoutput[3].dh : PastValue 256 x 1 (
+    LSTMoutput[3].output
+  )
+  LSTMoutput[3].ft : Sigmoid 0 x 0 (
+    LSTMoutput[3].ft.z
+  )
+  LSTMoutput[3].ft.z : Plus 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left
+    LSTMoutput[3].ft.z./*+*/right
+  )
+  LSTMoutput[3].ft.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/left
+    LSTMoutput[3].ft.z./*+*/left./*+*/right
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[2].output
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].ft.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left
+    LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[3].dh
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].ft.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[3].ft.z./*+*/right.matrix
+  )
+  LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].ft.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[3].dc
+  )
+  LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].it : Sigmoid 0 x 0 (
+    LSTMoutput[3].it.z
+  )
+  LSTMoutput[3].it.z : Plus 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left
+    LSTMoutput[3].it.z./*+*/right
+  )
+  LSTMoutput[3].it.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/left
+    LSTMoutput[3].it.z./*+*/left./*+*/right
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[2].output
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].it.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/right./***/left
+    LSTMoutput[3].it.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].it.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[3].dh
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].it.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[3].it.z./*+*/right.matrix
+  )
+  LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].it.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[3].dc
+  )
+  LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].mt : ElementTimes 0 x 0 (
+    LSTMoutput[3].ot
+    LSTMoutput[3].mt./*.**/right
+  )
+  LSTMoutput[3].mt./*.**/right : Tanh 0 x 0 (
+    LSTMoutput[3].ct
+  )
+  LSTMoutput[3].ot : Sigmoid 0 x 0 (
+    LSTMoutput[3].ot.z
+  )
+  LSTMoutput[3].ot.z : Plus 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left
+    LSTMoutput[3].ot.z./*+*/right
+  )
+  LSTMoutput[3].ot.z./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/left
+    LSTMoutput[3].ot.z./*+*/left./*+*/right
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/left : Plus 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left : Times 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[2].output
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].ot.z./*+*/left./*+*/right : Times 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left
+    LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left : LearnableParameter 1024 x 256 ()
+  LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right : Scale 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor
+    LSTMoutput[3].dh
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].ot.z./*+*/right : DiagTimes 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector
+    LSTMoutput[3].ot.z./*+*/right.matrix
+  )
+  LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector : LearnableParameter 1024 x 1 ()
+  LSTMoutput[3].ot.z./*+*/right.matrix : Scale 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor
+    LSTMoutput[3].ct
+  )
+  LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x
+  )
+  LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].output : Times 0 x 0 (
+    LSTMoutput[3].Wmr
+    LSTMoutput[3].output./***/right
+  )
+  LSTMoutput[3].output./***/right : Scale 0 x 0 (
+    LSTMoutput[3].output./***/right.scalarScalingFactor
+    LSTMoutput[3].mt
+  )
+  LSTMoutput[3].output./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutput[3].output./***/right.scalarScalingFactor.x
+  )
+  LSTMoutput[3].output./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  LSTMoutput[3].Wmr : LearnableParameter 256 x 1024 ()
+  LSTMoutputW : Plus 0 x 0 (
+    LSTMoutputW./*+*/left
+    B
+  )
+  LSTMoutputW./*+*/left : Times 0 x 0 (
+    LSTMoutputW./*+*/left./***/left
+    LSTMoutputW./*+*/left./***/right
+  )
+  LSTMoutputW./*+*/left./***/left : LearnableParameter 132 x 256 ()
+  LSTMoutputW./*+*/left./***/right : Scale 0 x 0 (
+    LSTMoutputW./*+*/left./***/right.scalarScalingFactor
+    LSTMoutput[3].output
+  )
+  LSTMoutputW./*+*/left./***/right.scalarScalingFactor : Exp 0 x 0 (
+    LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x
+  )
+  LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x : LearnableParameter 1 x 1 ()
+  ScaledLogLikelihood : Minus 0 x 0 (
+    LSTMoutputW
+    logPrior
+  )
+]
 GetTrainCriterionNodes  ...
 GetEvalCriterionNodes  ...
  nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
+LSTMoutput[1].mt./*.**/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].mt./*.**/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].mt./*.**/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	 nodes in the recurrent loops : 
+LSTMoutput[1].mt./*.**/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].mt./*.**/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].mt./*.**/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	
 
-Validating node cr 
+Validating for node cr. 272 nodes to process in pass 1.
 
-Validating --> labels = InputValue
-Validating --> W = LearnableParameter
-Validating --> sW = LearnableParameter
-Validating --> expsW = Exp(sW[1, 1])
-Validating --> LSTMoutput3.Wmr = LearnableParameter
-Validating --> LSTMoutput3.sWmr = LearnableParameter
-Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1])
-Validating --> LSTMoutput3.Wxo = LearnableParameter
-Validating --> LSTMoutput3.sWxo = LearnableParameter
-Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1])
-Validating --> LSTMoutput2.Wmr = LearnableParameter
-Validating --> LSTMoutput2.sWmr = LearnableParameter
-Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1])
-Validating --> LSTMoutput2.Wxo = LearnableParameter
-Validating --> LSTMoutput2.sWxo = LearnableParameter
-Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1])
-Validating --> LSTMoutput1.Wmr = LearnableParameter
-Validating --> LSTMoutput1.sWmr = LearnableParameter
-Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1])
-Validating --> LSTMoutput1.Wxo = LearnableParameter
-Validating --> LSTMoutput1.sWxo = LearnableParameter
-Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1])
-Validating --> features = InputValue
-Validating --> feashift = RowSlice(features[363, 1])
-Validating --> featNorm.xMean = Mean(feashift[33, 1])
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 1])
-Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, 1], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1])
-Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, 1])
-Validating --> LSTMoutput1.bo = LearnableParameter
-Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, 1], LSTMoutput1.bo[1024, 1])
-Validating --> LSTMoutput1.Who = LearnableParameter
-Validating --> LSTMoutput1.sWho = LearnableParameter
-Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1])
-Validating --> LSTMoutput1.Wco = LearnableParameter
-Validating --> LSTMoutput1.sWco = LearnableParameter
-Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1])
-Validating --> LSTMoutput1.Wxf = LearnableParameter
-Validating --> LSTMoutput1.sWxf = LearnableParameter
-Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1])
-Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, 1])
-Validating --> LSTMoutput1.bf = LearnableParameter
-Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, 1], LSTMoutput1.bf[1024, 1])
-Validating --> LSTMoutput1.Whf = LearnableParameter
-Validating --> LSTMoutput1.sWhf = LearnableParameter
-Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1])
-Validating --> LSTMoutput1.Wcf = LearnableParameter
-Validating --> LSTMoutput1.sWcf = LearnableParameter
-Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1])
-Validating --> LSTMoutput1.Wxi = LearnableParameter
-Validating --> LSTMoutput1.sWxi = LearnableParameter
-Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1])
-Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, 1])
-Validating --> LSTMoutput1.bi = LearnableParameter
-Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, 1], LSTMoutput1.bi[1024, 1])
-Validating --> LSTMoutput1.Whi = LearnableParameter
-Validating --> LSTMoutput1.sWhi = LearnableParameter
-Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1])
-Validating --> LSTMoutput1.Wci = LearnableParameter
-Validating --> LSTMoutput1.sWci = LearnableParameter
-Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1])
-Validating --> LSTMoutput1.Wxc = LearnableParameter
-Validating --> LSTMoutput1.sWxc = LearnableParameter
-Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1])
-Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, 1])
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, 1])
-Validating --> LSTMoutput1.Whc = LearnableParameter
-Validating --> LSTMoutput1.sWhc = LearnableParameter
-Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1])
-Validating --> LSTMoutput1.bc = LearnableParameter
-Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, 1])
-Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, 1], LSTMoutput1.Whodh[1024, 1])
-Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, 1])
-Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, 1], LSTMoutput1.Whfdh[1024, 1])
-Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, 1], LSTMoutput1.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, 1])
-Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, 1], LSTMoutput1.Whidh[1024, 1])
-Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, 1], LSTMoutput1.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, 1])
-Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, 1])
-Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 1], LSTMoutput1.bc[1024, 1])
-Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, 1], LSTMoutput1.unnamed161[1024, 1])
-Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, 1])
-Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.unnamed159[1024, 1])
-Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, 1], LSTMoutput1.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput1.unnamed174[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, 1])
-Validating --> LSTMoutput2.bo = LearnableParameter
-Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, 1], LSTMoutput2.bo[1024, 1])
-Validating --> LSTMoutput2.Who = LearnableParameter
-Validating --> LSTMoutput2.sWho = LearnableParameter
-Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1])
-Validating --> LSTMoutput2.Wco = LearnableParameter
-Validating --> LSTMoutput2.sWco = LearnableParameter
-Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1])
-Validating --> LSTMoutput2.Wxf = LearnableParameter
-Validating --> LSTMoutput2.sWxf = LearnableParameter
-Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1])
-Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, 1])
-Validating --> LSTMoutput2.bf = LearnableParameter
-Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, 1], LSTMoutput2.bf[1024, 1])
-Validating --> LSTMoutput2.Whf = LearnableParameter
-Validating --> LSTMoutput2.sWhf = LearnableParameter
-Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1])
-Validating --> LSTMoutput2.Wcf = LearnableParameter
-Validating --> LSTMoutput2.sWcf = LearnableParameter
-Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1])
-Validating --> LSTMoutput2.Wxi = LearnableParameter
-Validating --> LSTMoutput2.sWxi = LearnableParameter
-Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1])
-Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, 1])
-Validating --> LSTMoutput2.bi = LearnableParameter
-Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, 1], LSTMoutput2.bi[1024, 1])
-Validating --> LSTMoutput2.Whi = LearnableParameter
-Validating --> LSTMoutput2.sWhi = LearnableParameter
-Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1])
-Validating --> LSTMoutput2.Wci = LearnableParameter
-Validating --> LSTMoutput2.sWci = LearnableParameter
-Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1])
-Validating --> LSTMoutput2.Wxc = LearnableParameter
-Validating --> LSTMoutput2.sWxc = LearnableParameter
-Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1])
-Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, 1])
-Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, 1])
-Validating --> LSTMoutput2.Whc = LearnableParameter
-Validating --> LSTMoutput2.sWhc = LearnableParameter
-Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1])
-Validating --> LSTMoutput2.bc = LearnableParameter
-Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, 1])
-Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, 1], LSTMoutput2.Whodh[1024, 1])
-Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, 1])
-Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, 1], LSTMoutput2.Whfdh[1024, 1])
-Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, 1], LSTMoutput2.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, 1])
-Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, 1], LSTMoutput2.Whidh[1024, 1])
-Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, 1], LSTMoutput2.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, 1])
-Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, 1])
-Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, 1], LSTMoutput2.bc[1024, 1])
-Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, 1], LSTMoutput2.unnamed211[1024, 1])
-Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, 1])
-Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.unnamed209[1024, 1])
-Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, 1], LSTMoutput2.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput2.unnamed224[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, 1])
-Validating --> LSTMoutput3.bo = LearnableParameter
-Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, 1], LSTMoutput3.bo[1024, 1])
-Validating --> LSTMoutput3.Who = LearnableParameter
-Validating --> LSTMoutput3.sWho = LearnableParameter
-Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1])
-Validating --> LSTMoutput3.Wco = LearnableParameter
-Validating --> LSTMoutput3.sWco = LearnableParameter
-Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1])
-Validating --> LSTMoutput3.Wxf = LearnableParameter
-Validating --> LSTMoutput3.sWxf = LearnableParameter
-Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1])
-Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, 1])
-Validating --> LSTMoutput3.bf = LearnableParameter
-Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, 1], LSTMoutput3.bf[1024, 1])
-Validating --> LSTMoutput3.Whf = LearnableParameter
-Validating --> LSTMoutput3.sWhf = LearnableParameter
-Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1])
-Validating --> LSTMoutput3.Wcf = LearnableParameter
-Validating --> LSTMoutput3.sWcf = LearnableParameter
-Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1])
-Validating --> LSTMoutput3.Wxi = LearnableParameter
-Validating --> LSTMoutput3.sWxi = LearnableParameter
-Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1])
-Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, 1])
-Validating --> LSTMoutput3.bi = LearnableParameter
-Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, 1], LSTMoutput3.bi[1024, 1])
-Validating --> LSTMoutput3.Whi = LearnableParameter
-Validating --> LSTMoutput3.sWhi = LearnableParameter
-Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1])
-Validating --> LSTMoutput3.Wci = LearnableParameter
-Validating --> LSTMoutput3.sWci = LearnableParameter
-Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1])
-Validating --> LSTMoutput3.Wxc = LearnableParameter
-Validating --> LSTMoutput3.sWxc = LearnableParameter
-Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1])
-Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, 1])
-Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, 1])
-Validating --> LSTMoutput3.Whc = LearnableParameter
-Validating --> LSTMoutput3.sWhc = LearnableParameter
-Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1])
-Validating --> LSTMoutput3.bc = LearnableParameter
-Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, 1])
-Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, 1])
-Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, 1], LSTMoutput3.Whodh[1024, 1])
-Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, 1])
-Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, 1], LSTMoutput3.Whfdh[1024, 1])
-Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, 1], LSTMoutput3.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, 1])
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, 1], LSTMoutput3.Whidh[1024, 1])
-Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, 1], LSTMoutput3.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, 1])
-Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, 1])
-Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 1], LSTMoutput3.bc[1024, 1])
-Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, 1], LSTMoutput3.unnamed261[1024, 1])
-Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, 1])
-Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.unnamed259[1024, 1])
-Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, 1], LSTMoutput3.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 1], LSTMoutput3.unnamed274[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=3452816845, H=3452816845, C=3452816845}, 1])
-Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, 1])
-Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, 1])
-Validating --> b = LearnableParameter
-Validating --> LSTMoutputW = Plus(unnamed283[132, 1], b[132, 1])
-Validating --> cr = CrossEntropyWithSoftmax(labels[132, 1], LSTMoutputW[132, 1])
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, 1]) -> [256, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
 
-Found 6 PreCompute nodes
-	NodeName: featNorm.xMean
-	NodeName: featNorm.xStdDev
-	NodeName: logPrior.Prior
-	NodeName: featNorm.xMean
-	NodeName: featNorm.xStdDev
-	NodeName: logPrior.Prior
+Validating for node cr. 183 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
+
+Validating for node cr. 60 nodes to process in pass 3.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
+
+Validating for node cr, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 1], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 1]) -> [33, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 1], LSTMoutput[1].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 1], LSTMoutput[1].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 1], LSTMoutput[1].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 1], LSTMoutput[1].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 1], LSTMoutput[2].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 1], LSTMoutput[2].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 1], LSTMoutput[2].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 1], LSTMoutput[2].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 1], LSTMoutput[3].it.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 1], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 1], LSTMoutput[3].bit./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 1], LSTMoutput[3].bit[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 1], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 1], LSTMoutput[3].mt./*.**/right[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 1]) -> [1024, MBSize 1]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 1]) -> [256, MBSize 1]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 1]) -> [132, MBSize 1]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 1], B[132, 1]) -> [132, MBSize 1]
+Validating --> cr = CrossEntropyWithSoftmax(labels[132, MBSize 1], LSTMoutputW[132, MBSize 1]) -> [1, 1]
+
+127 out of 272 nodes do not share the minibatch layout with the input data.
+
+
+Precomputing --> 3 PreCompute nodes found.
+
+	NodeName: featNorm.invStdDevVector
+	NodeName: featNorm.meanVector
+	NodeName: logPrior.x
 minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
 requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
  nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	 nodes in the recurrent loops : 
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	
 
-Validating node featNorm.xMean 
+Validating for node featNorm.invStdDevVector. 3 nodes to process in pass 1.
 
-Validating --> features = InputValue
-Validating --> feashift = RowSlice(features[363, 640])
-Validating --> featNorm.xMean = Mean(feashift[33, 640])
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
+
+Validating for node featNorm.invStdDevVector, final verification.
+
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 1]) -> [33, 1]
+
+1 out of 3 nodes do not share the minibatch layout with the input data.
 
  nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	 nodes in the recurrent loops : 
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	
 
-Validating node featNorm.xStdDev 
+Validating for node featNorm.meanVector. 3 nodes to process in pass 1.
 
-Validating --> features = InputValue
-Validating --> feashift = RowSlice(features[363, 640])
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 640])
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
+
+Validating for node featNorm.meanVector, final verification.
+
+Validating --> features = InputValue -> [363, MBSize 1]
+Validating --> feashift = RowSlice(features[363, MBSize 1]) -> [33, MBSize 1]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 1]) -> [33, 1]
+
+1 out of 3 nodes do not share the minibatch layout with the input data.
 
  nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	 nodes in the recurrent loops : 
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	
 
-Validating node logPrior.Prior 
+Validating for node logPrior.x. 2 nodes to process in pass 1.
 
-Validating --> labels = InputValue
-Validating --> logPrior.Prior = Mean(labels[132, 640])
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1]
+
+Validating for node logPrior.x. 1 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1]
+
+Validating for node logPrior.x, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 1]
+Validating --> logPrior.x = Mean(labels[132, MBSize 1]) -> [132, 1]
+
+1 out of 2 nodes do not share the minibatch layout with the input data.
+
+
+Precomputing --> Completed.
 
 Set Max Temp Mem Size For Convolution Nodes to 0 samples.
-Starting Epoch 1: learning rate per sample = 0.000781  momentum = 0.000000 
+Starting Epoch 1: learning rate per sample = 0.000781  effective momentum = 0.000000 
 minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
+ nodes in the recurrent loops : 
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	 nodes in the recurrent loops : 
+LSTMoutput[1].dh	LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ot.z./*+*/left./*+*/right	LSTMoutput[1].ot.z./*+*/left	LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[1].ft.z./*+*/left./*+*/right	LSTMoutput[1].ft.z./*+*/left	LSTMoutput[1].dc	LSTMoutput[1].ft.z./*+*/right.matrix	LSTMoutput[1].ft.z./*+*/right	LSTMoutput[1].ft.z	LSTMoutput[1].ft	LSTMoutput[1].bft	LSTMoutput[1].it.z./*+*/left./*+*/right./***/right	LSTMoutput[1].it.z./*+*/left./*+*/right	LSTMoutput[1].it.z./*+*/left	LSTMoutput[1].it.z./*+*/right.matrix	LSTMoutput[1].it.z./*+*/right	LSTMoutput[1].it.z	LSTMoutput[1].it	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[1].bit./*.**/right.z./*+*/right	LSTMoutput[1].bit./*.**/right.z	LSTMoutput[1].bit./*.**/right	LSTMoutput[1].bit	LSTMoutput[1].ct	LSTMoutput[1].ot.z./*+*/right.matrix	LSTMoutput[1].ot.z./*+*/right	LSTMoutput[1].ot.z	LSTMoutput[1].ot	LSTMoutput[1].mt./*.**/right	LSTMoutput[1].mt	LSTMoutput[1].output./***/right	LSTMoutput[1].output	 nodes in the recurrent loops : 
+LSTMoutput[2].dh	LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ot.z./*+*/left./*+*/right	LSTMoutput[2].ot.z./*+*/left	LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[2].ft.z./*+*/left./*+*/right	LSTMoutput[2].ft.z./*+*/left	LSTMoutput[2].dc	LSTMoutput[2].ft.z./*+*/right.matrix	LSTMoutput[2].ft.z./*+*/right	LSTMoutput[2].ft.z	LSTMoutput[2].ft	LSTMoutput[2].bft	LSTMoutput[2].it.z./*+*/left./*+*/right./***/right	LSTMoutput[2].it.z./*+*/left./*+*/right	LSTMoutput[2].it.z./*+*/left	LSTMoutput[2].it.z./*+*/right.matrix	LSTMoutput[2].it.z./*+*/right	LSTMoutput[2].it.z	LSTMoutput[2].it	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[2].bit./*.**/right.z./*+*/right	LSTMoutput[2].bit./*.**/right.z	LSTMoutput[2].bit./*.**/right	LSTMoutput[2].bit	LSTMoutput[2].ct	LSTMoutput[2].ot.z./*+*/right.matrix	LSTMoutput[2].ot.z./*+*/right	LSTMoutput[2].ot.z	LSTMoutput[2].ot	LSTMoutput[2].mt./*.**/right	LSTMoutput[2].mt	LSTMoutput[2].output./***/right	LSTMoutput[2].output	 nodes in the recurrent loops : 
+LSTMoutput[3].dh	LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ot.z./*+*/left./*+*/right	LSTMoutput[3].ot.z./*+*/left	LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right	LSTMoutput[3].ft.z./*+*/left./*+*/right	LSTMoutput[3].ft.z./*+*/left	LSTMoutput[3].dc	LSTMoutput[3].ft.z./*+*/right.matrix	LSTMoutput[3].ft.z./*+*/right	LSTMoutput[3].ft.z	LSTMoutput[3].ft	LSTMoutput[3].bft	LSTMoutput[3].it.z./*+*/left./*+*/right./***/right	LSTMoutput[3].it.z./*+*/left./*+*/right	LSTMoutput[3].it.z./*+*/left	LSTMoutput[3].it.z./*+*/right.matrix	LSTMoutput[3].it.z./*+*/right	LSTMoutput[3].it.z	LSTMoutput[3].it	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right	LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left	LSTMoutput[3].bit./*.**/right.z./*+*/right	LSTMoutput[3].bit./*.**/right.z	LSTMoutput[3].bit./*.**/right	LSTMoutput[3].bit	LSTMoutput[3].ct	LSTMoutput[3].ot.z./*+*/right.matrix	LSTMoutput[3].ot.z./*+*/right	LSTMoutput[3].ot.z	LSTMoutput[3].ot	LSTMoutput[3].mt./*.**/right	LSTMoutput[3].mt	LSTMoutput[3].output./***/right	LSTMoutput[3].output	
+
+Validating for node Err. 272 nodes to process in pass 1.
+
+Validating --> labels = InputValue -> [132, MBSize 640]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 640]
+Validating --> feashift = RowSlice(features[363, MBSize 640]) -> [33, MBSize 640]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 640]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 640]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 640], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 640], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 640], LSTMoutput[1].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 640], LSTMoutput[1].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 640], LSTMoutput[1].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 640], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 640], LSTMoutput[2].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 640], LSTMoutput[2].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 640], LSTMoutput[2].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 640], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 640], LSTMoutput[3].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 640], LSTMoutput[3].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 640], LSTMoutput[3].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 640]) -> [132, MBSize 640]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 640], B[132, 1]) -> [132, MBSize 640]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 640], LSTMoutputW[132, MBSize 640]) -> [1, 1]
+
+Validating for node Err. 180 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 640]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 640]
+Validating --> feashift = RowSlice(features[363, MBSize 640]) -> [33, MBSize 640]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 640]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 640]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 640], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 640], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 640], LSTMoutput[1].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 640], LSTMoutput[1].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 640], LSTMoutput[1].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 640], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 640], LSTMoutput[2].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 640], LSTMoutput[2].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 640], LSTMoutput[2].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 640], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 640], LSTMoutput[3].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 640], LSTMoutput[3].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 640], LSTMoutput[3].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 640]) -> [132, MBSize 640]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 640], B[132, 1]) -> [132, MBSize 640]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 640], LSTMoutputW[132, MBSize 640]) -> [1, 1]
+
+Validating for node Err. 6 nodes to process in pass 3.
+
+Validating --> labels = InputValue -> [132, MBSize 640]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 640]
+Validating --> feashift = RowSlice(features[363, MBSize 640]) -> [33, MBSize 640]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 640]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 640]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 640], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 640], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 640], LSTMoutput[1].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 640], LSTMoutput[1].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 640], LSTMoutput[1].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 640], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 640], LSTMoutput[2].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 640], LSTMoutput[2].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 640], LSTMoutput[2].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 640], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 640], LSTMoutput[3].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 640], LSTMoutput[3].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 640], LSTMoutput[3].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 640]) -> [132, MBSize 640]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 640], B[132, 1]) -> [132, MBSize 640]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 640], LSTMoutputW[132, MBSize 640]) -> [1, 1]
+
+Validating for node Err, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 640]
+Validating --> LSTMoutputW./*+*/left./***/left = LearnableParameter -> [132, 256]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutputW./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutputW./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].output./***/right.scalarScalingFactor = Exp(LSTMoutput[3].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].output./***/right.scalarScalingFactor = Exp(LSTMoutput[2].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].Wmr = LearnableParameter -> [256, 1024]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].output./***/right.scalarScalingFactor = Exp(LSTMoutput[1].output./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> features = InputValue -> [363, MBSize 640]
+Validating --> feashift = RowSlice(features[363, MBSize 640]) -> [33, MBSize 640]
+Validating --> featNorm.meanVector = Mean(feashift[33, MBSize 640]) -> [33, 1]
+Validating --> featNorm.invStdDevVector = InvStdDev(feashift[33, MBSize 640]) -> [33, 1]
+Validating --> featNorm = PerDimMeanVarNormalization(feashift[33, MBSize 640], featNorm.meanVector[33, 1], featNorm.invStdDevVector[33, 1]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 33], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 33]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], featNorm[33, MBSize 640]) -> [33, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/left./***/left[1024, 33], LSTMoutput[1].bit./*.**/right.z./*+*/left./***/right[33, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[1].dh = PastValue(LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left./*+*/right = Times(LSTMoutput[1].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/left = Plus(LSTMoutput[1].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left./*+*/right = Times(LSTMoutput[1].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/left = Plus(LSTMoutput[1].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].dc = PastValue(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/right.matrix = Scale(LSTMoutput[1].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z./*+*/right = DiagTimes(LSTMoutput[1].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft.z = Plus(LSTMoutput[1].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ft = Sigmoid(LSTMoutput[1].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bft = ElementTimes(LSTMoutput[1].ft[1024, MBSize 640], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[1].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left./*+*/right = Times(LSTMoutput[1].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[1].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/left = Plus(LSTMoutput[1].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/right.matrix = Scale(LSTMoutput[1].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z./*+*/right = DiagTimes(LSTMoutput[1].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it.z = Plus(LSTMoutput[1].it.z./*+*/left[1024, MBSize 640], LSTMoutput[1].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].it = Sigmoid(LSTMoutput[1].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right.z = Plus(LSTMoutput[1].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[1].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit./*.**/right = Tanh(LSTMoutput[1].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].bit = ElementTimes(LSTMoutput[1].it[1024, MBSize 640], LSTMoutput[1].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ct = Plus(LSTMoutput[1].bft[1024, MBSize 640], LSTMoutput[1].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/right.matrix = Scale(LSTMoutput[1].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z./*+*/right = DiagTimes(LSTMoutput[1].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[1].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot.z = Plus(LSTMoutput[1].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[1].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].ot = Sigmoid(LSTMoutput[1].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].mt./*.**/right = Tanh(LSTMoutput[1].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].mt = ElementTimes(LSTMoutput[1].ot[1024, MBSize 640], LSTMoutput[1].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].output./***/right = Scale(LSTMoutput[1].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[1].output = Times(LSTMoutput[1].Wmr[256, 1024], LSTMoutput[1].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[1].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[2].dh = PastValue(LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left./*+*/right = Times(LSTMoutput[2].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/left = Plus(LSTMoutput[2].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left./*+*/right = Times(LSTMoutput[2].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/left = Plus(LSTMoutput[2].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].dc = PastValue(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/right.matrix = Scale(LSTMoutput[2].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z./*+*/right = DiagTimes(LSTMoutput[2].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft.z = Plus(LSTMoutput[2].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ft = Sigmoid(LSTMoutput[2].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bft = ElementTimes(LSTMoutput[2].ft[1024, MBSize 640], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[2].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left./*+*/right = Times(LSTMoutput[2].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[2].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/left = Plus(LSTMoutput[2].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/right.matrix = Scale(LSTMoutput[2].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z./*+*/right = DiagTimes(LSTMoutput[2].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it.z = Plus(LSTMoutput[2].it.z./*+*/left[1024, MBSize 640], LSTMoutput[2].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].it = Sigmoid(LSTMoutput[2].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right.z = Plus(LSTMoutput[2].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[2].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit./*.**/right = Tanh(LSTMoutput[2].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].bit = ElementTimes(LSTMoutput[2].it[1024, MBSize 640], LSTMoutput[2].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ct = Plus(LSTMoutput[2].bft[1024, MBSize 640], LSTMoutput[2].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/right.matrix = Scale(LSTMoutput[2].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z./*+*/right = DiagTimes(LSTMoutput[2].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[2].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot.z = Plus(LSTMoutput[2].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[2].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].ot = Sigmoid(LSTMoutput[2].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].mt./*.**/right = Tanh(LSTMoutput[2].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].mt = ElementTimes(LSTMoutput[2].ot[1024, MBSize 640], LSTMoutput[2].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].output./***/right = Scale(LSTMoutput[2].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[2].output = Times(LSTMoutput[2].Wmr[256, 1024], LSTMoutput[2].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left = Times(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/left./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor = Exp(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[2].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left = LearnableParameter -> [1024, 256]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x = LearnableParameter -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor = Exp(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor.x[1, 1]) -> [1, 1]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right = LearnableParameter -> [1024, 1]
+Validating --> LSTMoutput[3].dh = PastValue(LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left./*+*/right = Times(LSTMoutput[3].ot.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ot.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/left = Plus(LSTMoutput[3].ot.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left./*+*/right = Times(LSTMoutput[3].ft.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].ft.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/left = Plus(LSTMoutput[3].ft.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].dc = PastValue(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/right.matrix = Scale(LSTMoutput[3].ft.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z./*+*/right = DiagTimes(LSTMoutput[3].ft.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ft.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft.z = Plus(LSTMoutput[3].ft.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ft.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ft = Sigmoid(LSTMoutput[3].ft.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bft = ElementTimes(LSTMoutput[3].ft[1024, MBSize 640], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right./***/right = Scale(LSTMoutput[3].it.z./*+*/left./*+*/right./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left./*+*/right = Times(LSTMoutput[3].it.z./*+*/left./*+*/right./***/left[1024, 256], LSTMoutput[3].it.z./*+*/left./*+*/right./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/left = Plus(LSTMoutput[3].it.z./*+*/left./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/left./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/right.matrix = Scale(LSTMoutput[3].it.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].dc[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z./*+*/right = DiagTimes(LSTMoutput[3].it.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].it.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it.z = Plus(LSTMoutput[3].it.z./*+*/left[1024, MBSize 640], LSTMoutput[3].it.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].it = Sigmoid(LSTMoutput[3].it.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right = Scale(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].dh[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left = Times(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/left[1024, 256], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left./***/right[256, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z./*+*/right = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right./*+*/right[1024, 1]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right.z = Plus(LSTMoutput[3].bit./*.**/right.z./*+*/left[1024, MBSize 640], LSTMoutput[3].bit./*.**/right.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit./*.**/right = Tanh(LSTMoutput[3].bit./*.**/right.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].bit = ElementTimes(LSTMoutput[3].it[1024, MBSize 640], LSTMoutput[3].bit./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ct = Plus(LSTMoutput[3].bft[1024, MBSize 640], LSTMoutput[3].bit[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/right.matrix = Scale(LSTMoutput[3].ot.z./*+*/right.matrix.scalarScalingFactor[1, 1], LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z./*+*/right = DiagTimes(LSTMoutput[3].ot.z./*+*/right.diagonalMatrixAsColumnVector[1024, 1], LSTMoutput[3].ot.z./*+*/right.matrix[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot.z = Plus(LSTMoutput[3].ot.z./*+*/left[1024, MBSize 640], LSTMoutput[3].ot.z./*+*/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].ot = Sigmoid(LSTMoutput[3].ot.z[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].mt./*.**/right = Tanh(LSTMoutput[3].ct[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].mt = ElementTimes(LSTMoutput[3].ot[1024, MBSize 640], LSTMoutput[3].mt./*.**/right[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].output./***/right = Scale(LSTMoutput[3].output./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].mt[1024, MBSize 640]) -> [1024, MBSize 640]
+Validating --> LSTMoutput[3].output = Times(LSTMoutput[3].Wmr[256, 1024], LSTMoutput[3].output./***/right[1024, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutputW./*+*/left./***/right = Scale(LSTMoutputW./*+*/left./***/right.scalarScalingFactor[1, 1], LSTMoutput[3].output[256, MBSize 640]) -> [256, MBSize 640]
+Validating --> LSTMoutputW./*+*/left = Times(LSTMoutputW./*+*/left./***/left[132, 256], LSTMoutputW./*+*/left./***/right[256, MBSize 640]) -> [132, MBSize 640]
+Validating --> B = LearnableParameter -> [132, 1]
+Validating --> LSTMoutputW = Plus(LSTMoutputW./*+*/left[132, MBSize 640], B[132, 1]) -> [132, MBSize 640]
+Validating --> Err = ErrorPrediction(labels[132, MBSize 640], LSTMoutputW[132, MBSize 640]) -> [1, 1]
+
+127 out of 272 nodes do not share the minibatch layout with the input data.
+
 
 Starting minibatch loop.
- nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	 nodes in the recurrent loops : 
-LSTMoutput1.dh	LSTMoutput1.unnamed169	LSTMoutput1.Whodh	LSTMoutput1.unnamed172	LSTMoutput1.unnamed163	LSTMoutput1.Whfdh	LSTMoutput1.unnamed166	LSTMoutput1.dc	LSTMoutput1.unnamed164	LSTMoutput1.Wcfdc	LSTMoutput1.unnamed165	LSTMoutput1.ft	LSTMoutput1.bft	LSTMoutput1.unnamed152	LSTMoutput1.Whidh	LSTMoutput1.unnamed155	LSTMoutput1.unnamed153	LSTMoutput1.Wcidc	LSTMoutput1.unnamed154	LSTMoutput1.it	LSTMoutput1.unnamed158	LSTMoutput1.Whcdh	LSTMoutput1.unnamed161	LSTMoutput1.unnamed160	LSTMoutput1.unnamed159	LSTMoutput1.bit	LSTMoutput1.ct	LSTMoutput1.unnamed170	LSTMoutput1.Wcoct	LSTMoutput1.unnamed171	LSTMoutput1.ot	LSTMoutput1.unnamed174	LSTMoutput1.mt	LSTMoutput1.unnamed175	LSTMoutput1.output	 nodes in the recurrent loops : 
-LSTMoutput2.dh	LSTMoutput2.unnamed219	LSTMoutput2.Whodh	LSTMoutput2.unnamed222	LSTMoutput2.unnamed213	LSTMoutput2.Whfdh	LSTMoutput2.unnamed216	LSTMoutput2.dc	LSTMoutput2.unnamed214	LSTMoutput2.Wcfdc	LSTMoutput2.unnamed215	LSTMoutput2.ft	LSTMoutput2.bft	LSTMoutput2.unnamed202	LSTMoutput2.Whidh	LSTMoutput2.unnamed205	LSTMoutput2.unnamed203	LSTMoutput2.Wcidc	LSTMoutput2.unnamed204	LSTMoutput2.it	LSTMoutput2.unnamed208	LSTMoutput2.Whcdh	LSTMoutput2.unnamed211	LSTMoutput2.unnamed210	LSTMoutput2.unnamed209	LSTMoutput2.bit	LSTMoutput2.ct	LSTMoutput2.unnamed220	LSTMoutput2.Wcoct	LSTMoutput2.unnamed221	LSTMoutput2.ot	LSTMoutput2.unnamed224	LSTMoutput2.mt	LSTMoutput2.unnamed225	LSTMoutput2.output	 nodes in the recurrent loops : 
-LSTMoutput3.dh	LSTMoutput3.unnamed269	LSTMoutput3.Whodh	LSTMoutput3.unnamed272	LSTMoutput3.unnamed263	LSTMoutput3.Whfdh	LSTMoutput3.unnamed266	LSTMoutput3.dc	LSTMoutput3.unnamed264	LSTMoutput3.Wcfdc	LSTMoutput3.unnamed265	LSTMoutput3.ft	LSTMoutput3.bft	LSTMoutput3.unnamed252	LSTMoutput3.Whidh	LSTMoutput3.unnamed255	LSTMoutput3.unnamed253	LSTMoutput3.Wcidc	LSTMoutput3.unnamed254	LSTMoutput3.it	LSTMoutput3.unnamed258	LSTMoutput3.Whcdh	LSTMoutput3.unnamed261	LSTMoutput3.unnamed260	LSTMoutput3.unnamed259	LSTMoutput3.bit	LSTMoutput3.ct	LSTMoutput3.unnamed270	LSTMoutput3.Wcoct	LSTMoutput3.unnamed271	LSTMoutput3.ot	LSTMoutput3.unnamed274	LSTMoutput3.mt	LSTMoutput3.unnamed275	LSTMoutput3.output	
-
-Validating node Err 
-
-Validating --> labels = InputValue
-Validating --> W = LearnableParameter
-Validating --> sW = LearnableParameter
-Validating --> expsW = Exp(sW[1, 1])
-Validating --> LSTMoutput3.Wmr = LearnableParameter
-Validating --> LSTMoutput3.sWmr = LearnableParameter
-Validating --> LSTMoutput3.expsWmr = Exp(LSTMoutput3.sWmr[1, 1])
-Validating --> LSTMoutput3.Wxo = LearnableParameter
-Validating --> LSTMoutput3.sWxo = LearnableParameter
-Validating --> LSTMoutput3.expsWxo = Exp(LSTMoutput3.sWxo[1, 1])
-Validating --> LSTMoutput2.Wmr = LearnableParameter
-Validating --> LSTMoutput2.sWmr = LearnableParameter
-Validating --> LSTMoutput2.expsWmr = Exp(LSTMoutput2.sWmr[1, 1])
-Validating --> LSTMoutput2.Wxo = LearnableParameter
-Validating --> LSTMoutput2.sWxo = LearnableParameter
-Validating --> LSTMoutput2.expsWxo = Exp(LSTMoutput2.sWxo[1, 1])
-Validating --> LSTMoutput1.Wmr = LearnableParameter
-Validating --> LSTMoutput1.sWmr = LearnableParameter
-Validating --> LSTMoutput1.expsWmr = Exp(LSTMoutput1.sWmr[1, 1])
-Validating --> LSTMoutput1.Wxo = LearnableParameter
-Validating --> LSTMoutput1.sWxo = LearnableParameter
-Validating --> LSTMoutput1.expsWxo = Exp(LSTMoutput1.sWxo[1, 1])
-Validating --> features = InputValue
-Validating --> feashift = RowSlice(features[363, 640])
-Validating --> featNorm.xMean = Mean(feashift[33, 640])
-Validating --> featNorm.xStdDev = InvStdDev(feashift[33, 640])
-Validating --> featNorm.xNorm = PerDimMeanVarNormalization(feashift[33, 640], featNorm.xMean[33, 1], featNorm.xStdDev[33, 1])
-Validating --> LSTMoutput1.unnamed168 = Scale(LSTMoutput1.expsWxo[1, 1], featNorm.xNorm[33, 640])
-Validating --> LSTMoutput1.Wxox = Times(LSTMoutput1.Wxo[1024, 33], LSTMoutput1.unnamed168[33, 640])
-Validating --> LSTMoutput1.bo = LearnableParameter
-Validating --> LSTMoutput1.unnamed173 = Plus(LSTMoutput1.Wxox[1024, 640], LSTMoutput1.bo[1024, 1])
-Validating --> LSTMoutput1.Who = LearnableParameter
-Validating --> LSTMoutput1.sWho = LearnableParameter
-Validating --> LSTMoutput1.expsWho = Exp(LSTMoutput1.sWho[1, 1])
-Validating --> LSTMoutput1.Wco = LearnableParameter
-Validating --> LSTMoutput1.sWco = LearnableParameter
-Validating --> LSTMoutput1.expsWco = Exp(LSTMoutput1.sWco[1, 1])
-Validating --> LSTMoutput1.Wxf = LearnableParameter
-Validating --> LSTMoutput1.sWxf = LearnableParameter
-Validating --> LSTMoutput1.expsWxf = Exp(LSTMoutput1.sWxf[1, 1])
-Validating --> LSTMoutput1.unnamed162 = Scale(LSTMoutput1.expsWxf[1, 1], featNorm.xNorm[33, 640])
-Validating --> LSTMoutput1.Wxfx = Times(LSTMoutput1.Wxf[1024, 33], LSTMoutput1.unnamed162[33, 640])
-Validating --> LSTMoutput1.bf = LearnableParameter
-Validating --> LSTMoutput1.unnamed167 = Plus(LSTMoutput1.Wxfx[1024, 640], LSTMoutput1.bf[1024, 1])
-Validating --> LSTMoutput1.Whf = LearnableParameter
-Validating --> LSTMoutput1.sWhf = LearnableParameter
-Validating --> LSTMoutput1.expsWhf = Exp(LSTMoutput1.sWhf[1, 1])
-Validating --> LSTMoutput1.Wcf = LearnableParameter
-Validating --> LSTMoutput1.sWcf = LearnableParameter
-Validating --> LSTMoutput1.expsWcf = Exp(LSTMoutput1.sWcf[1, 1])
-Validating --> LSTMoutput1.Wxi = LearnableParameter
-Validating --> LSTMoutput1.sWxi = LearnableParameter
-Validating --> LSTMoutput1.expsWxi = Exp(LSTMoutput1.sWxi[1, 1])
-Validating --> LSTMoutput1.unnamed151 = Scale(LSTMoutput1.expsWxi[1, 1], featNorm.xNorm[33, 640])
-Validating --> LSTMoutput1.Wxix = Times(LSTMoutput1.Wxi[1024, 33], LSTMoutput1.unnamed151[33, 640])
-Validating --> LSTMoutput1.bi = LearnableParameter
-Validating --> LSTMoutput1.unnamed156 = Plus(LSTMoutput1.Wxix[1024, 640], LSTMoutput1.bi[1024, 1])
-Validating --> LSTMoutput1.Whi = LearnableParameter
-Validating --> LSTMoutput1.sWhi = LearnableParameter
-Validating --> LSTMoutput1.expsWhi = Exp(LSTMoutput1.sWhi[1, 1])
-Validating --> LSTMoutput1.Wci = LearnableParameter
-Validating --> LSTMoutput1.sWci = LearnableParameter
-Validating --> LSTMoutput1.expsWci = Exp(LSTMoutput1.sWci[1, 1])
-Validating --> LSTMoutput1.Wxc = LearnableParameter
-Validating --> LSTMoutput1.sWxc = LearnableParameter
-Validating --> LSTMoutput1.expsWxc = Exp(LSTMoutput1.sWxc[1, 1])
-Validating --> LSTMoutput1.unnamed157 = Scale(LSTMoutput1.expsWxc[1, 1], featNorm.xNorm[33, 640])
-Validating --> LSTMoutput1.Wxcx = Times(LSTMoutput1.Wxc[1024, 33], LSTMoutput1.unnamed157[33, 640])
-Validating --> LSTMoutput1.Whc = LearnableParameter
-Validating --> LSTMoutput1.sWhc = LearnableParameter
-Validating --> LSTMoutput1.expsWhc = Exp(LSTMoutput1.sWhc[1, 1])
-Validating --> LSTMoutput1.bc = LearnableParameter
-Validating --> LSTMoutput1.dh = PastValue(LSTMoutput1.output[256, 640])
-Validating --> LSTMoutput1.unnamed169 = Scale(LSTMoutput1.expsWho[1, 1], LSTMoutput1.dh[256, 640])
-Validating --> LSTMoutput1.Whodh = Times(LSTMoutput1.Who[1024, 256], LSTMoutput1.unnamed169[256, 640])
-Validating --> LSTMoutput1.unnamed172 = Plus(LSTMoutput1.unnamed173[1024, 640], LSTMoutput1.Whodh[1024, 640])
-Validating --> LSTMoutput1.unnamed163 = Scale(LSTMoutput1.expsWhf[1, 1], LSTMoutput1.dh[256, 640])
-Validating --> LSTMoutput1.Whfdh = Times(LSTMoutput1.Whf[1024, 256], LSTMoutput1.unnamed163[256, 640])
-Validating --> LSTMoutput1.unnamed166 = Plus(LSTMoutput1.unnamed167[1024, 640], LSTMoutput1.Whfdh[1024, 640])
-Validating --> LSTMoutput1.dc = PastValue(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput1.unnamed164 = Scale(LSTMoutput1.expsWcf[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput1.Wcfdc = DiagTimes(LSTMoutput1.Wcf[1024, 1], LSTMoutput1.unnamed164[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput1.unnamed165 = Plus(LSTMoutput1.unnamed166[1024, 640], LSTMoutput1.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput1.ft = Sigmoid(LSTMoutput1.unnamed165[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput1.bft = ElementTimes(LSTMoutput1.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput1.unnamed152 = Scale(LSTMoutput1.expsWhi[1, 1], LSTMoutput1.dh[256, 640])
-Validating --> LSTMoutput1.Whidh = Times(LSTMoutput1.Whi[1024, 256], LSTMoutput1.unnamed152[256, 640])
-Validating --> LSTMoutput1.unnamed155 = Plus(LSTMoutput1.unnamed156[1024, 640], LSTMoutput1.Whidh[1024, 640])
-Validating --> LSTMoutput1.unnamed153 = Scale(LSTMoutput1.expsWci[1, 1], LSTMoutput1.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput1.Wcidc = DiagTimes(LSTMoutput1.Wci[1024, 1], LSTMoutput1.unnamed153[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput1.unnamed154 = Plus(LSTMoutput1.unnamed155[1024, 640], LSTMoutput1.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput1.it = Sigmoid(LSTMoutput1.unnamed154[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput1.unnamed158 = Scale(LSTMoutput1.expsWhc[1, 1], LSTMoutput1.dh[256, 640])
-Validating --> LSTMoutput1.Whcdh = Times(LSTMoutput1.Whc[1024, 256], LSTMoutput1.unnamed158[256, 640])
-Validating --> LSTMoutput1.unnamed161 = Plus(LSTMoutput1.Whcdh[1024, 640], LSTMoutput1.bc[1024, 1])
-Validating --> LSTMoutput1.unnamed160 = Plus(LSTMoutput1.Wxcx[1024, 640], LSTMoutput1.unnamed161[1024, 640])
-Validating --> LSTMoutput1.unnamed159 = Tanh(LSTMoutput1.unnamed160[1024, 640])
-Validating --> LSTMoutput1.bit = ElementTimes(LSTMoutput1.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput1.unnamed159[1024, 640])
-Validating --> LSTMoutput1.ct = Plus(LSTMoutput1.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput1.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput1.unnamed170 = Scale(LSTMoutput1.expsWco[1, 1], LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput1.Wcoct = DiagTimes(LSTMoutput1.Wco[1024, 1], LSTMoutput1.unnamed170[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput1.unnamed171 = Plus(LSTMoutput1.unnamed172[1024, 640], LSTMoutput1.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput1.ot = Sigmoid(LSTMoutput1.unnamed171[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput1.unnamed174 = Tanh(LSTMoutput1.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput1.mt = ElementTimes(LSTMoutput1.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput1.unnamed174[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput1.unnamed175 = Scale(LSTMoutput1.expsWmr[1, 1], LSTMoutput1.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput1.output = Times(LSTMoutput1.Wmr[256, 1024], LSTMoutput1.unnamed175[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput2.unnamed218 = Scale(LSTMoutput2.expsWxo[1, 1], LSTMoutput1.output[256, 640])
-Validating --> LSTMoutput2.Wxox = Times(LSTMoutput2.Wxo[1024, 256], LSTMoutput2.unnamed218[256, 640])
-Validating --> LSTMoutput2.bo = LearnableParameter
-Validating --> LSTMoutput2.unnamed223 = Plus(LSTMoutput2.Wxox[1024, 640], LSTMoutput2.bo[1024, 1])
-Validating --> LSTMoutput2.Who = LearnableParameter
-Validating --> LSTMoutput2.sWho = LearnableParameter
-Validating --> LSTMoutput2.expsWho = Exp(LSTMoutput2.sWho[1, 1])
-Validating --> LSTMoutput2.Wco = LearnableParameter
-Validating --> LSTMoutput2.sWco = LearnableParameter
-Validating --> LSTMoutput2.expsWco = Exp(LSTMoutput2.sWco[1, 1])
-Validating --> LSTMoutput2.Wxf = LearnableParameter
-Validating --> LSTMoutput2.sWxf = LearnableParameter
-Validating --> LSTMoutput2.expsWxf = Exp(LSTMoutput2.sWxf[1, 1])
-Validating --> LSTMoutput2.unnamed212 = Scale(LSTMoutput2.expsWxf[1, 1], LSTMoutput1.output[256, 640])
-Validating --> LSTMoutput2.Wxfx = Times(LSTMoutput2.Wxf[1024, 256], LSTMoutput2.unnamed212[256, 640])
-Validating --> LSTMoutput2.bf = LearnableParameter
-Validating --> LSTMoutput2.unnamed217 = Plus(LSTMoutput2.Wxfx[1024, 640], LSTMoutput2.bf[1024, 1])
-Validating --> LSTMoutput2.Whf = LearnableParameter
-Validating --> LSTMoutput2.sWhf = LearnableParameter
-Validating --> LSTMoutput2.expsWhf = Exp(LSTMoutput2.sWhf[1, 1])
-Validating --> LSTMoutput2.Wcf = LearnableParameter
-Validating --> LSTMoutput2.sWcf = LearnableParameter
-Validating --> LSTMoutput2.expsWcf = Exp(LSTMoutput2.sWcf[1, 1])
-Validating --> LSTMoutput2.Wxi = LearnableParameter
-Validating --> LSTMoutput2.sWxi = LearnableParameter
-Validating --> LSTMoutput2.expsWxi = Exp(LSTMoutput2.sWxi[1, 1])
-Validating --> LSTMoutput2.unnamed201 = Scale(LSTMoutput2.expsWxi[1, 1], LSTMoutput1.output[256, 640])
-Validating --> LSTMoutput2.Wxix = Times(LSTMoutput2.Wxi[1024, 256], LSTMoutput2.unnamed201[256, 640])
-Validating --> LSTMoutput2.bi = LearnableParameter
-Validating --> LSTMoutput2.unnamed206 = Plus(LSTMoutput2.Wxix[1024, 640], LSTMoutput2.bi[1024, 1])
-Validating --> LSTMoutput2.Whi = LearnableParameter
-Validating --> LSTMoutput2.sWhi = LearnableParameter
-Validating --> LSTMoutput2.expsWhi = Exp(LSTMoutput2.sWhi[1, 1])
-Validating --> LSTMoutput2.Wci = LearnableParameter
-Validating --> LSTMoutput2.sWci = LearnableParameter
-Validating --> LSTMoutput2.expsWci = Exp(LSTMoutput2.sWci[1, 1])
-Validating --> LSTMoutput2.Wxc = LearnableParameter
-Validating --> LSTMoutput2.sWxc = LearnableParameter
-Validating --> LSTMoutput2.expsWxc = Exp(LSTMoutput2.sWxc[1, 1])
-Validating --> LSTMoutput2.unnamed207 = Scale(LSTMoutput2.expsWxc[1, 1], LSTMoutput1.output[256, 640])
-Validating --> LSTMoutput2.Wxcx = Times(LSTMoutput2.Wxc[1024, 256], LSTMoutput2.unnamed207[256, 640])
-Validating --> LSTMoutput2.Whc = LearnableParameter
-Validating --> LSTMoutput2.sWhc = LearnableParameter
-Validating --> LSTMoutput2.expsWhc = Exp(LSTMoutput2.sWhc[1, 1])
-Validating --> LSTMoutput2.bc = LearnableParameter
-Validating --> LSTMoutput2.dh = PastValue(LSTMoutput2.output[256, 640])
-Validating --> LSTMoutput2.unnamed219 = Scale(LSTMoutput2.expsWho[1, 1], LSTMoutput2.dh[256, 640])
-Validating --> LSTMoutput2.Whodh = Times(LSTMoutput2.Who[1024, 256], LSTMoutput2.unnamed219[256, 640])
-Validating --> LSTMoutput2.unnamed222 = Plus(LSTMoutput2.unnamed223[1024, 640], LSTMoutput2.Whodh[1024, 640])
-Validating --> LSTMoutput2.unnamed213 = Scale(LSTMoutput2.expsWhf[1, 1], LSTMoutput2.dh[256, 640])
-Validating --> LSTMoutput2.Whfdh = Times(LSTMoutput2.Whf[1024, 256], LSTMoutput2.unnamed213[256, 640])
-Validating --> LSTMoutput2.unnamed216 = Plus(LSTMoutput2.unnamed217[1024, 640], LSTMoutput2.Whfdh[1024, 640])
-Validating --> LSTMoutput2.dc = PastValue(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput2.unnamed214 = Scale(LSTMoutput2.expsWcf[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput2.Wcfdc = DiagTimes(LSTMoutput2.Wcf[1024, 1], LSTMoutput2.unnamed214[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput2.unnamed215 = Plus(LSTMoutput2.unnamed216[1024, 640], LSTMoutput2.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput2.ft = Sigmoid(LSTMoutput2.unnamed215[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput2.bft = ElementTimes(LSTMoutput2.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput2.unnamed202 = Scale(LSTMoutput2.expsWhi[1, 1], LSTMoutput2.dh[256, 640])
-Validating --> LSTMoutput2.Whidh = Times(LSTMoutput2.Whi[1024, 256], LSTMoutput2.unnamed202[256, 640])
-Validating --> LSTMoutput2.unnamed205 = Plus(LSTMoutput2.unnamed206[1024, 640], LSTMoutput2.Whidh[1024, 640])
-Validating --> LSTMoutput2.unnamed203 = Scale(LSTMoutput2.expsWci[1, 1], LSTMoutput2.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput2.Wcidc = DiagTimes(LSTMoutput2.Wci[1024, 1], LSTMoutput2.unnamed203[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput2.unnamed204 = Plus(LSTMoutput2.unnamed205[1024, 640], LSTMoutput2.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput2.it = Sigmoid(LSTMoutput2.unnamed204[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput2.unnamed208 = Scale(LSTMoutput2.expsWhc[1, 1], LSTMoutput2.dh[256, 640])
-Validating --> LSTMoutput2.Whcdh = Times(LSTMoutput2.Whc[1024, 256], LSTMoutput2.unnamed208[256, 640])
-Validating --> LSTMoutput2.unnamed211 = Plus(LSTMoutput2.Whcdh[1024, 640], LSTMoutput2.bc[1024, 1])
-Validating --> LSTMoutput2.unnamed210 = Plus(LSTMoutput2.Wxcx[1024, 640], LSTMoutput2.unnamed211[1024, 640])
-Validating --> LSTMoutput2.unnamed209 = Tanh(LSTMoutput2.unnamed210[1024, 640])
-Validating --> LSTMoutput2.bit = ElementTimes(LSTMoutput2.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput2.unnamed209[1024, 640])
-Validating --> LSTMoutput2.ct = Plus(LSTMoutput2.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput2.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput2.unnamed220 = Scale(LSTMoutput2.expsWco[1, 1], LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput2.Wcoct = DiagTimes(LSTMoutput2.Wco[1024, 1], LSTMoutput2.unnamed220[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput2.unnamed221 = Plus(LSTMoutput2.unnamed222[1024, 640], LSTMoutput2.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput2.ot = Sigmoid(LSTMoutput2.unnamed221[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput2.unnamed224 = Tanh(LSTMoutput2.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput2.mt = ElementTimes(LSTMoutput2.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput2.unnamed224[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput2.unnamed225 = Scale(LSTMoutput2.expsWmr[1, 1], LSTMoutput2.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput2.output = Times(LSTMoutput2.Wmr[256, 1024], LSTMoutput2.unnamed225[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput3.unnamed268 = Scale(LSTMoutput3.expsWxo[1, 1], LSTMoutput2.output[256, 640])
-Validating --> LSTMoutput3.Wxox = Times(LSTMoutput3.Wxo[1024, 256], LSTMoutput3.unnamed268[256, 640])
-Validating --> LSTMoutput3.bo = LearnableParameter
-Validating --> LSTMoutput3.unnamed273 = Plus(LSTMoutput3.Wxox[1024, 640], LSTMoutput3.bo[1024, 1])
-Validating --> LSTMoutput3.Who = LearnableParameter
-Validating --> LSTMoutput3.sWho = LearnableParameter
-Validating --> LSTMoutput3.expsWho = Exp(LSTMoutput3.sWho[1, 1])
-Validating --> LSTMoutput3.Wco = LearnableParameter
-Validating --> LSTMoutput3.sWco = LearnableParameter
-Validating --> LSTMoutput3.expsWco = Exp(LSTMoutput3.sWco[1, 1])
-Validating --> LSTMoutput3.Wxf = LearnableParameter
-Validating --> LSTMoutput3.sWxf = LearnableParameter
-Validating --> LSTMoutput3.expsWxf = Exp(LSTMoutput3.sWxf[1, 1])
-Validating --> LSTMoutput3.unnamed262 = Scale(LSTMoutput3.expsWxf[1, 1], LSTMoutput2.output[256, 640])
-Validating --> LSTMoutput3.Wxfx = Times(LSTMoutput3.Wxf[1024, 256], LSTMoutput3.unnamed262[256, 640])
-Validating --> LSTMoutput3.bf = LearnableParameter
-Validating --> LSTMoutput3.unnamed267 = Plus(LSTMoutput3.Wxfx[1024, 640], LSTMoutput3.bf[1024, 1])
-Validating --> LSTMoutput3.Whf = LearnableParameter
-Validating --> LSTMoutput3.sWhf = LearnableParameter
-Validating --> LSTMoutput3.expsWhf = Exp(LSTMoutput3.sWhf[1, 1])
-Validating --> LSTMoutput3.Wcf = LearnableParameter
-Validating --> LSTMoutput3.sWcf = LearnableParameter
-Validating --> LSTMoutput3.expsWcf = Exp(LSTMoutput3.sWcf[1, 1])
-Validating --> LSTMoutput3.Wxi = LearnableParameter
-Validating --> LSTMoutput3.sWxi = LearnableParameter
-Validating --> LSTMoutput3.expsWxi = Exp(LSTMoutput3.sWxi[1, 1])
-Validating --> LSTMoutput3.unnamed251 = Scale(LSTMoutput3.expsWxi[1, 1], LSTMoutput2.output[256, 640])
-Validating --> LSTMoutput3.Wxix = Times(LSTMoutput3.Wxi[1024, 256], LSTMoutput3.unnamed251[256, 640])
-Validating --> LSTMoutput3.bi = LearnableParameter
-Validating --> LSTMoutput3.unnamed256 = Plus(LSTMoutput3.Wxix[1024, 640], LSTMoutput3.bi[1024, 1])
-Validating --> LSTMoutput3.Whi = LearnableParameter
-Validating --> LSTMoutput3.sWhi = LearnableParameter
-Validating --> LSTMoutput3.expsWhi = Exp(LSTMoutput3.sWhi[1, 1])
-Validating --> LSTMoutput3.Wci = LearnableParameter
-Validating --> LSTMoutput3.sWci = LearnableParameter
-Validating --> LSTMoutput3.expsWci = Exp(LSTMoutput3.sWci[1, 1])
-Validating --> LSTMoutput3.Wxc = LearnableParameter
-Validating --> LSTMoutput3.sWxc = LearnableParameter
-Validating --> LSTMoutput3.expsWxc = Exp(LSTMoutput3.sWxc[1, 1])
-Validating --> LSTMoutput3.unnamed257 = Scale(LSTMoutput3.expsWxc[1, 1], LSTMoutput2.output[256, 640])
-Validating --> LSTMoutput3.Wxcx = Times(LSTMoutput3.Wxc[1024, 256], LSTMoutput3.unnamed257[256, 640])
-Validating --> LSTMoutput3.Whc = LearnableParameter
-Validating --> LSTMoutput3.sWhc = LearnableParameter
-Validating --> LSTMoutput3.expsWhc = Exp(LSTMoutput3.sWhc[1, 1])
-Validating --> LSTMoutput3.bc = LearnableParameter
-Validating --> LSTMoutput3.dh = PastValue(LSTMoutput3.output[256, 640])
-Validating --> LSTMoutput3.unnamed269 = Scale(LSTMoutput3.expsWho[1, 1], LSTMoutput3.dh[256, 640])
-Validating --> LSTMoutput3.Whodh = Times(LSTMoutput3.Who[1024, 256], LSTMoutput3.unnamed269[256, 640])
-Validating --> LSTMoutput3.unnamed272 = Plus(LSTMoutput3.unnamed273[1024, 640], LSTMoutput3.Whodh[1024, 640])
-Validating --> LSTMoutput3.unnamed263 = Scale(LSTMoutput3.expsWhf[1, 1], LSTMoutput3.dh[256, 640])
-Validating --> LSTMoutput3.Whfdh = Times(LSTMoutput3.Whf[1024, 256], LSTMoutput3.unnamed263[256, 640])
-Validating --> LSTMoutput3.unnamed266 = Plus(LSTMoutput3.unnamed267[1024, 640], LSTMoutput3.Whfdh[1024, 640])
-Validating --> LSTMoutput3.dc = PastValue(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput3.unnamed264 = Scale(LSTMoutput3.expsWcf[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput3.Wcfdc = DiagTimes(LSTMoutput3.Wcf[1024, 1], LSTMoutput3.unnamed264[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput3.unnamed265 = Plus(LSTMoutput3.unnamed266[1024, 640], LSTMoutput3.Wcfdc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput3.ft = Sigmoid(LSTMoutput3.unnamed265[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput3.bft = ElementTimes(LSTMoutput3.ft[1024 {W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput3.unnamed252 = Scale(LSTMoutput3.expsWhi[1, 1], LSTMoutput3.dh[256, 640])
-Validating --> LSTMoutput3.Whidh = Times(LSTMoutput3.Whi[1024, 256], LSTMoutput3.unnamed252[256, 640])
-Validating --> LSTMoutput3.unnamed255 = Plus(LSTMoutput3.unnamed256[1024, 640], LSTMoutput3.Whidh[1024, 640])
-Validating --> LSTMoutput3.unnamed253 = Scale(LSTMoutput3.expsWci[1, 1], LSTMoutput3.dc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput3.Wcidc = DiagTimes(LSTMoutput3.Wci[1024, 1], LSTMoutput3.unnamed253[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput3.unnamed254 = Plus(LSTMoutput3.unnamed255[1024, 640], LSTMoutput3.Wcidc[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput3.it = Sigmoid(LSTMoutput3.unnamed254[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput3.unnamed258 = Scale(LSTMoutput3.expsWhc[1, 1], LSTMoutput3.dh[256, 640])
-Validating --> LSTMoutput3.Whcdh = Times(LSTMoutput3.Whc[1024, 256], LSTMoutput3.unnamed258[256, 640])
-Validating --> LSTMoutput3.unnamed261 = Plus(LSTMoutput3.Whcdh[1024, 640], LSTMoutput3.bc[1024, 1])
-Validating --> LSTMoutput3.unnamed260 = Plus(LSTMoutput3.Wxcx[1024, 640], LSTMoutput3.unnamed261[1024, 640])
-Validating --> LSTMoutput3.unnamed259 = Tanh(LSTMoutput3.unnamed260[1024, 640])
-Validating --> LSTMoutput3.bit = ElementTimes(LSTMoutput3.it[1024 {W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput3.unnamed259[1024, 640])
-Validating --> LSTMoutput3.ct = Plus(LSTMoutput3.bft[1024 {W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput3.bit[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput3.unnamed270 = Scale(LSTMoutput3.expsWco[1, 1], LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput3.Wcoct = DiagTimes(LSTMoutput3.Wco[1024, 1], LSTMoutput3.unnamed270[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput3.unnamed271 = Plus(LSTMoutput3.unnamed272[1024, 640], LSTMoutput3.Wcoct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput3.ot = Sigmoid(LSTMoutput3.unnamed271[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput3.unnamed274 = Tanh(LSTMoutput3.ct[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput3.mt = ElementTimes(LSTMoutput3.ot[1024 {W=3452816845, H=3452816845, C=3452816845}, 640], LSTMoutput3.unnamed274[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput3.unnamed275 = Scale(LSTMoutput3.expsWmr[1, 1], LSTMoutput3.mt[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> LSTMoutput3.output = Times(LSTMoutput3.Wmr[256, 1024], LSTMoutput3.unnamed275[1024 {W=3452816845, H=3452816845, C=3452816845}, 640])
-Validating --> unnamed284 = Scale(expsW[1, 1], LSTMoutput3.output[256, 640])
-Validating --> unnamed283 = Times(W[132, 256], unnamed284[256, 640])
-Validating --> b = LearnableParameter
-Validating --> LSTMoutputW = Plus(unnamed283[132, 640], b[132, 1])
-Validating --> Err = ErrorPrediction(labels[132, 640], LSTMoutputW[132, 640])
-
- Epoch[ 1 of 4]-Minibatch[   1-  10 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.78817415; EvalErr[0]PerSample = 0.89125001; TotalTime = 17.48173s; TotalTimePerSample = 2.73152ms; SamplesPerSecond = 366
- Epoch[ 1 of 4]-Minibatch[  11-  20 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.59419441; EvalErr[0]PerSample = 0.86328125; TotalTime = 18.07901s; TotalTimePerSample = 2.82485ms; SamplesPerSecond = 354
- Epoch[ 1 of 4]-Minibatch[  21-  30 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.52217722; EvalErr[0]PerSample = 0.81859374; TotalTime = 15.52239s; TotalTimePerSample = 2.42537ms; SamplesPerSecond = 412
-Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 4.5853896; EvalErrPerSample = 0.84082031; Ave LearnRatePerSample = 0.0007812500116; EpochTime=54.814574
-Starting Epoch 2: learning rate per sample = 0.000781  momentum = 0.899991 
+ Epoch[ 1 of 4]-Minibatch[   1-  10 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.78817383; EvalErr[0]PerSample = 0.89125000; TotalTime = 20.56791s; TotalTimePerSample = 3.21374ms; SamplesPerSecond = 311
+ Epoch[ 1 of 4]-Minibatch[  11-  20 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.59419434; EvalErr[0]PerSample = 0.86328125; TotalTime = 20.21589s; TotalTimePerSample = 3.15873ms; SamplesPerSecond = 316
+ Epoch[ 1 of 4]-Minibatch[  21-  30 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.52217773; EvalErr[0]PerSample = 0.81859375; TotalTime = 22.25449s; TotalTimePerSample = 3.47726ms; SamplesPerSecond = 287
+Finished Epoch[ 1 of 4]: [Training Set] TrainLossPerSample = 4.5853896; EvalErrPerSample = 0.84082031; Ave LearnRatePerSample = 0.0007812500116; EpochTime=68.996574
+Starting Epoch 2: learning rate per sample = 0.000781  effective momentum = 0.900000 
 minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20632), data subset 0 of 1, with 1 datapasses
 
 Starting minibatch loop.
- Epoch[ 2 of 4]-Minibatch[   1-  10 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.29597616; EvalErr[0]PerSample = 0.82859373; TotalTime = 16.34016s; TotalTimePerSample = 2.55315ms; SamplesPerSecond = 391
- Epoch[ 2 of 4]-Minibatch[  11-  20 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.27295351; EvalErr[0]PerSample = 0.87312502; TotalTime = 17.48450s; TotalTimePerSample = 2.73195ms; SamplesPerSecond = 366
- Epoch[ 2 of 4]-Minibatch[  21-  30 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  3.95423460; EvalErr[0]PerSample = 0.82499999; TotalTime = 17.16935s; TotalTimePerSample = 2.68271ms; SamplesPerSecond = 372
-Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 4.1132793; EvalErrPerSample = 0.83588868; Ave LearnRatePerSample = 0.0007812500116; EpochTime=55.11008
-Starting Epoch 3: learning rate per sample = 0.000781  momentum = 0.899991 
+ Epoch[ 2 of 4]-Minibatch[   1-  10 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.29597595; EvalErr[0]PerSample = 0.82859375; TotalTime = 20.96682s; TotalTimePerSample = 3.27607ms; SamplesPerSecond = 305
+ Epoch[ 2 of 4]-Minibatch[  11-  20 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.27295776; EvalErr[0]PerSample = 0.87312500; TotalTime = 20.34551s; TotalTimePerSample = 3.17899ms; SamplesPerSecond = 314
+ Epoch[ 2 of 4]-Minibatch[  21-  30 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  3.95423523; EvalErr[0]PerSample = 0.82500000; TotalTime = 20.81350s; TotalTimePerSample = 3.25211ms; SamplesPerSecond = 307
+Finished Epoch[ 2 of 4]: [Training Set] TrainLossPerSample = 4.1132798; EvalErrPerSample = 0.83588868; Ave LearnRatePerSample = 0.0007812500116; EpochTime=66.807404
+Starting Epoch 3: learning rate per sample = 0.000781  effective momentum = 0.900000 
 minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40962), data subset 0 of 1, with 1 datapasses
 
 Starting minibatch loop.
- Epoch[ 3 of 4]-Minibatch[   1-  10 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.17982197; EvalErr[0]PerSample = 0.85281253; TotalTime = 16.15247s; TotalTimePerSample = 2.52382ms; SamplesPerSecond = 396
- Epoch[ 3 of 4]-Minibatch[  11-  20 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.16644049; EvalErr[0]PerSample = 0.86703128; TotalTime = 15.53962s; TotalTimePerSample = 2.42807ms; SamplesPerSecond = 411
- Epoch[ 3 of 4]-Minibatch[  21-  30 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  3.95540762; EvalErr[0]PerSample = 0.83859372; TotalTime = 18.71239s; TotalTimePerSample = 2.92381ms; SamplesPerSecond = 342
-Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 4.0661387; EvalErrPerSample = 0.84653324; Ave LearnRatePerSample = 0.0007812500116; EpochTime=54.14235
-Starting Epoch 4: learning rate per sample = 0.000781  momentum = 0.899991 
+ Epoch[ 3 of 4]-Minibatch[   1-  10 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.17982239; EvalErr[0]PerSample = 0.85281250; TotalTime = 18.89055s; TotalTimePerSample = 2.95165ms; SamplesPerSecond = 338
+ Epoch[ 3 of 4]-Minibatch[  11-  20 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.16644226; EvalErr[0]PerSample = 0.86703125; TotalTime = 20.64840s; TotalTimePerSample = 3.22631ms; SamplesPerSecond = 309
+ Epoch[ 3 of 4]-Minibatch[  21-  30 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  3.95540649; EvalErr[0]PerSample = 0.83859375; TotalTime = 20.57245s; TotalTimePerSample = 3.21444ms; SamplesPerSecond = 311
+Finished Epoch[ 3 of 4]: [Training Set] TrainLossPerSample = 4.0661392; EvalErrPerSample = 0.84653324; Ave LearnRatePerSample = 0.0007812500116; EpochTime=64.052172
+Starting Epoch 4: learning rate per sample = 0.000781  effective momentum = 0.900000 
 minibatchiterator: epoch 3: frames [61440..81920] (first utterance at frame 61554), data subset 0 of 1, with 1 datapasses
 
 Starting minibatch loop.
- Epoch[ 4 of 4]-Minibatch[   1-  10 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.06800747; EvalErr[0]PerSample = 0.82734376; TotalTime = 17.96433s; TotalTimePerSample = 2.80693ms; SamplesPerSecond = 356
- Epoch[ 4 of 4]-Minibatch[  11-  20 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.10716391; EvalErr[0]PerSample = 0.88249999; TotalTime = 15.48745s; TotalTimePerSample = 2.41991ms; SamplesPerSecond = 413
- Epoch[ 4 of 4]-Minibatch[  21-  30 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  3.91763616; EvalErr[0]PerSample = 0.82390624; TotalTime = 16.49760s; TotalTimePerSample = 2.57775ms; SamplesPerSecond = 387
-Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 3.9796886; EvalErrPerSample = 0.82807618; Ave LearnRatePerSample = 0.0007812500116; EpochTime=63.545066
+ Epoch[ 4 of 4]-Minibatch[   1-  10 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.06800842; EvalErr[0]PerSample = 0.82734375; TotalTime = 18.79745s; TotalTimePerSample = 2.93710ms; SamplesPerSecond = 340
+ Epoch[ 4 of 4]-Minibatch[  11-  20 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  4.10716370; EvalErr[0]PerSample = 0.88250000; TotalTime = 18.98044s; TotalTimePerSample = 2.96569ms; SamplesPerSecond = 337
+ Epoch[ 4 of 4]-Minibatch[  21-  30 of 1024]: SamplesSeen = 6400; TrainLossPerSample =  3.91763550; EvalErr[0]PerSample = 0.82390625; TotalTime = 18.64471s; TotalTimePerSample = 2.91324ms; SamplesPerSecond = 343
+Finished Epoch[ 4 of 4]: [Training Set] TrainLossPerSample = 3.9796886; EvalErrPerSample = 0.82807618; Ave LearnRatePerSample = 0.0007812500116; EpochTime=60.617335
+CNTKCommandTrainEnd: speechTrain
 COMPLETED
diff --git a/Tests/Speech/QuickE2E/baseline.gpu.txt b/Tests/Speech/QuickE2E/baseline.gpu.txt
index e66a6fb45..f37d41184 100644
--- a/Tests/Speech/QuickE2E/baseline.gpu.txt
+++ b/Tests/Speech/QuickE2E/baseline.gpu.txt
@@ -1,7 +1,7 @@
-=== Running /home/vlivan/cntk/bin/x86_64.gpu.release.acml/cntk configFile=/home/vlivan/cntk/Tests/Speech/QuickE2E/cntk.config RunDir=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu DataDir=/home/vlivan/cntk/Tests/Speech/Data DeviceId=Auto
-running on localhost at 2015/07/29 19:11:08
-command line options: 
-configFile=/home/vlivan/cntk/Tests/Speech/QuickE2E/cntk.config RunDir=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu DataDir=/home/vlivan/cntk/Tests/Speech/Data DeviceId=Auto 
+=== Running /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E/cntk.config RunDir=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E DeviceId=0
+running on localhost at 2015/10/24 12:49:00
+command line: 
+/home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E/cntk.config RunDir=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E DeviceId=0 
 
 >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
 precision=float
@@ -23,6 +23,31 @@ speechTrain=[
         uniformInit=true
         needPrior=true
     ]
+    ExperimentalNetworkBuilder=[    // the same as above but with BS
+        layerSizes=363:512:512:132
+        trainingCriterion='CE'
+        evalCriterion='Err'
+        applyMeanVarNorm=true
+        L = Length(layerSizes)-1    // number of model layers
+        features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label')
+        featNorm = if applyMeanVarNorm
+                   then MeanVarNorm(features)
+                   else features
+        layers[layer:1..L-1] = if layer > 1
+                               then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1])
+                               else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1])
+        outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1])
+        outZ = outLayer.z        // + PastValue(layerSizes[L], 1, outLayer.z)
+        CE = if trainingCriterion == 'CE'
+             then CrossEntropyWithSoftmax(labels, outZ, tag='criterion')
+             else Fail('unknown trainingCriterion ' + trainingCriterion)
+        Err = if evalCriterion == 'Err' then
+              ErrorPrediction(labels, outZ, tag='eval')
+              else Fail('unknown evalCriterion ' + evalCriterion)
+        logPrior = LogPrior(labels)
+        // TODO: how to add a tag to an infix operation?
+        ScaledLogLikelihood = Minus (outZ, logPrior, tag='output')
+    ]
     SGD=[
         epochSize=20480
         minibatchSize=64:256:1024:
@@ -61,21 +86,22 @@ speechTrain=[
       ]
     ]
 ]
-RunDir=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu
-DataDir=/home/vlivan/cntk/Tests/Speech/Data
-DeviceId=Auto
+RunDir=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu
+DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
+ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E
+DeviceId=0
 
 <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
 
 >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 precision=float
 command=speechTrain
-deviceId=Auto
+deviceId=0
 parallelTrain=false
 speechTrain=[
     action=train
-    modelPath=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn
-    deviceId=Auto
+    modelPath=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn
+    deviceId=0
     traceLevel=1
     SimpleNetworkBuilder=[
         layerSizes=363:512:512:132
@@ -87,6 +113,31 @@ speechTrain=[
         uniformInit=true
         needPrior=true
     ]
+    ExperimentalNetworkBuilder=[    // the same as above but with BS
+        layerSizes=363:512:512:132
+        trainingCriterion='CE'
+        evalCriterion='Err'
+        applyMeanVarNorm=true
+        L = Length(layerSizes)-1    // number of model layers
+        features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label')
+        featNorm = if applyMeanVarNorm
+                   then MeanVarNorm(features)
+                   else features
+        layers[layer:1..L-1] = if layer > 1
+                               then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1])
+                               else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1])
+        outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1])
+        outZ = outLayer.z        // + PastValue(layerSizes[L], 1, outLayer.z)
+        CE = if trainingCriterion == 'CE'
+             then CrossEntropyWithSoftmax(labels, outZ, tag='criterion')
+             else Fail('unknown trainingCriterion ' + trainingCriterion)
+        Err = if evalCriterion == 'Err' then
+              ErrorPrediction(labels, outZ, tag='eval')
+              else Fail('unknown evalCriterion ' + evalCriterion)
+        logPrior = LogPrior(labels)
+        // TODO: how to add a tag to an infix operation?
+        ScaledLogLikelihood = Minus (outZ, logPrior, tag='output')
+    ]
     SGD=[
         epochSize=20480
         minibatchSize=64:256:1024:
@@ -118,30 +169,32 @@ speechTrain=[
           scpFile=glob_0000.scp
       ]
       labels=[
-          mlfFile=/home/vlivan/cntk/Tests/Speech/Data/glob_0000.mlf
-          labelMappingFile=/home/vlivan/cntk/Tests/Speech/Data/state.list
+          mlfFile=/home/mluser/src/cplx_master/Tests/Speech/Data/glob_0000.mlf
+          labelMappingFile=/home/mluser/src/cplx_master/Tests/Speech/Data/state.list
           labelDim=132
           labelType=Category
       ]
     ]
 ]
-RunDir=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu
-DataDir=/home/vlivan/cntk/Tests/Speech/Data
-DeviceId=Auto
+RunDir=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu
+DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
+ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E
+DeviceId=0
 
 <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 
 >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 configparameters: cntk.config:command=speechTrain
-configparameters: cntk.config:DataDir=/home/vlivan/cntk/Tests/Speech/Data
-configparameters: cntk.config:deviceId=Auto
+configparameters: cntk.config:ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E
+configparameters: cntk.config:DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
+configparameters: cntk.config:deviceId=0
 configparameters: cntk.config:parallelTrain=false
 configparameters: cntk.config:precision=float
-configparameters: cntk.config:RunDir=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu
+configparameters: cntk.config:RunDir=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu
 configparameters: cntk.config:speechTrain=[
     action=train
-    modelPath=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn
-    deviceId=Auto
+    modelPath=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn
+    deviceId=0
     traceLevel=1
     SimpleNetworkBuilder=[
         layerSizes=363:512:512:132
@@ -153,6 +206,31 @@ configparameters: cntk.config:speechTrain=[
         uniformInit=true
         needPrior=true
     ]
+    ExperimentalNetworkBuilder=[    // the same as above but with BS
+        layerSizes=363:512:512:132
+        trainingCriterion='CE'
+        evalCriterion='Err'
+        applyMeanVarNorm=true
+        L = Length(layerSizes)-1    // number of model layers
+        features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label')
+        featNorm = if applyMeanVarNorm
+                   then MeanVarNorm(features)
+                   else features
+        layers[layer:1..L-1] = if layer > 1
+                               then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1])
+                               else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1])
+        outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1])
+        outZ = outLayer.z        // + PastValue(layerSizes[L], 1, outLayer.z)
+        CE = if trainingCriterion == 'CE'
+             then CrossEntropyWithSoftmax(labels, outZ, tag='criterion')
+             else Fail('unknown trainingCriterion ' + trainingCriterion)
+        Err = if evalCriterion == 'Err' then
+              ErrorPrediction(labels, outZ, tag='eval')
+              else Fail('unknown evalCriterion ' + evalCriterion)
+        logPrior = LogPrior(labels)
+        // TODO: how to add a tag to an infix operation?
+        ScaledLogLikelihood = Minus (outZ, logPrior, tag='output')
+    ]
     SGD=[
         epochSize=20480
         minibatchSize=64:256:1024:
@@ -184,8 +262,8 @@ configparameters: cntk.config:speechTrain=[
           scpFile=glob_0000.scp
       ]
       labels=[
-          mlfFile=/home/vlivan/cntk/Tests/Speech/Data/glob_0000.mlf
-          labelMappingFile=/home/vlivan/cntk/Tests/Speech/Data/state.list
+          mlfFile=/home/mluser/src/cplx_master/Tests/Speech/Data/glob_0000.mlf
+          labelMappingFile=/home/mluser/src/cplx_master/Tests/Speech/Data/state.list
           labelDim=132
           labelType=Category
       ]
@@ -195,178 +273,293 @@ configparameters: cntk.config:speechTrain=[
 <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 command: speechTrain 
 precision = float
-lsof: WARNING: can't stat() ext4 file system /var/lib/docker/aufs
-      Output information may be incomplete.
-LockDevice: Capture device 0 and lock it for exclusive use
-LockDevice: Capture device 0 and lock it for exclusive use
+CNTKModelPath: /tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn
+CNTKCommandTrainInfo: speechTrain : 3
+CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3
+CNTKCommandTrainBegin: speechTrain
 SimpleNetworkBuilder Using GPU 0
 reading script file glob_0000.scp ... 948 entries
-total 132 state names in state list /home/vlivan/cntk/Tests/Speech/Data/state.list
-htkmlfreader: reading MLF file /home/vlivan/cntk/Tests/Speech/Data/glob_0000.mlf ...parse the line 55130
- total 948 entries
+trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion
+total 132 state names in state list /home/mluser/src/cplx_master/Tests/Speech/Data/state.list
+htkmlfreader: reading MLF file /home/mluser/src/cplx_master/Tests/Speech/Data/glob_0000.mlf ... total 948 entries
 ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
 label set 0: 129 classes
 minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
+SetUniformRandomValue (GPU): creating curand object with seed 1
 GetTrainCriterionNodes  ...
 GetEvalCriterionNodes  ...
 
 
-Validating node CrossEntropyWithSoftmax 
+Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1.
 
-Validating --> labels = InputValue
-Validating --> W2 = LearnableParameter
-Validating --> W1 = LearnableParameter
-Validating --> W0 = LearnableParameter
-Validating --> features = InputValue
-Validating --> MeanOfFeatures = Mean(features[363, 3])
-Validating --> InvStdOfFeatures = InvStdDev(features[363, 3])
-Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
-Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 3])
-Validating --> B0 = LearnableParameter
-Validating --> W0*features+B0 = Plus(W0*features[512, 3], B0[512, 1])
-Validating --> H1 = Sigmoid(W0*features+B0[512, 3])
-Validating --> W1*H1 = Times(W1[512, 512], H1[512, 3])
-Validating --> B1 = LearnableParameter
-Validating --> W1*H1+B1 = Plus(W1*H1[512, 3], B1[512, 1])
-Validating --> H2 = Sigmoid(W1*H1+B1[512, 3])
-Validating --> W2*H1 = Times(W2[132, 512], H2[512, 3])
-Validating --> B2 = LearnableParameter
-Validating --> HLast = Plus(W2*H1[132, 3], B2[132, 1])
-Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, 3], HLast[132, 3])
+Validating --> labels = InputValue -> [132, MBSize 3]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 3]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
+Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
+
+Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 3]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 3]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
+Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
+
+Validating for node CrossEntropyWithSoftmax, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 3]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 3]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
+Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
+
+9 out of 20 nodes do not share the minibatch layout with the input data.
+
+
+Precomputing --> 3 PreCompute nodes found.
 
-Found 3 PreCompute nodes
 	NodeName: InvStdOfFeatures
 	NodeName: MeanOfFeatures
 	NodeName: Prior
-minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0) with 1 datapasses
+minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
 requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
 
 
-Validating node InvStdOfFeatures 
+Validating for node InvStdOfFeatures. 2 nodes to process in pass 1.
 
-Validating --> features = InputValue
-Validating --> InvStdOfFeatures = InvStdDev(features[363, 64])
+Validating --> features = InputValue -> [363, MBSize 3]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
+
+Validating for node InvStdOfFeatures, final verification.
+
+Validating --> features = InputValue -> [363, MBSize 3]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
+
+1 out of 2 nodes do not share the minibatch layout with the input data.
 
 
 
-Validating node MeanOfFeatures 
+Validating for node MeanOfFeatures. 2 nodes to process in pass 1.
 
-Validating --> features = InputValue
-Validating --> MeanOfFeatures = Mean(features[363, 64])
+Validating --> features = InputValue -> [363, MBSize 3]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
+
+Validating for node MeanOfFeatures, final verification.
+
+Validating --> features = InputValue -> [363, MBSize 3]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
+
+1 out of 2 nodes do not share the minibatch layout with the input data.
 
 
 
-Validating node Prior 
+Validating for node Prior. 2 nodes to process in pass 1.
 
-Validating --> labels = InputValue
-Validating --> Prior = Mean(labels[132, 64])
+Validating --> labels = InputValue -> [132, MBSize 3]
+Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
+
+Validating for node Prior. 1 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 3]
+Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
+
+Validating for node Prior, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 3]
+Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
+
+1 out of 2 nodes do not share the minibatch layout with the input data.
+
+EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once.
+
+Precomputing --> Completed.
 
 Set Max Temp Mem Size For Convolution Nodes to 0 samples.
-Starting Epoch 1: learning rate per sample = 0.015625  momentum = 0.900000 
-minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0) with 1 datapasses
+Starting Epoch 1: learning rate per sample = 0.015625  effective momentum = 0.900000 
+minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
 
 
-Validating node EvalErrorPrediction 
+Validating for node EvalErrorPrediction. 20 nodes to process in pass 1.
 
-Validating --> labels = InputValue
-Validating --> W2 = LearnableParameter
-Validating --> W1 = LearnableParameter
-Validating --> W0 = LearnableParameter
-Validating --> features = InputValue
-Validating --> MeanOfFeatures = Mean(features[363, 64])
-Validating --> InvStdOfFeatures = InvStdDev(features[363, 64])
-Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 64], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
-Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 64])
-Validating --> B0 = LearnableParameter
-Validating --> W0*features+B0 = Plus(W0*features[512, 64], B0[512, 1])
-Validating --> H1 = Sigmoid(W0*features+B0[512, 64])
-Validating --> W1*H1 = Times(W1[512, 512], H1[512, 64])
-Validating --> B1 = LearnableParameter
-Validating --> W1*H1+B1 = Plus(W1*H1[512, 64], B1[512, 1])
-Validating --> H2 = Sigmoid(W1*H1+B1[512, 64])
-Validating --> W2*H1 = Times(W2[132, 512], H2[512, 64])
-Validating --> B2 = LearnableParameter
-Validating --> HLast = Plus(W2*H1[132, 64], B2[132, 1])
-Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, 64], HLast[132, 64])
+Validating --> labels = InputValue -> [132, MBSize 62]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 62]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
 
- Epoch[1 of 3]-Minibatch[1-10 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.3213539; EvalErr[0]PerSample = 0.89999998; TotalTime=0.064294; TotalTimePerSample=0.00010045938, SamplesPerSecond=9954
- Epoch[1 of 3]-Minibatch[11-20 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.1507101; EvalErr[0]PerSample = 0.8671875; TotalTime=0.055813; TotalTimePerSample=8.7207812e-05, SamplesPerSecond=11466
- Epoch[1 of 3]-Minibatch[21-30 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.9990096; EvalErr[0]PerSample = 0.87656248; TotalTime=0.062703; TotalTimePerSample=9.7973437e-05, SamplesPerSecond=10206
- Epoch[1 of 3]-Minibatch[31-40 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.8694596; EvalErr[0]PerSample = 0.87656248; TotalTime=0.059923; TotalTimePerSample=9.3629687e-05, SamplesPerSecond=10680
- Epoch[1 of 3]-Minibatch[41-50 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.8021927; EvalErr[0]PerSample = 0.87812501; TotalTime=0.061061; TotalTimePerSample=9.5407812e-05, SamplesPerSecond=10481
- Epoch[1 of 3]-Minibatch[51-60 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.7289093; EvalErr[0]PerSample = 0.86874998; TotalTime=0.062101; TotalTimePerSample=9.7032813e-05, SamplesPerSecond=10305
- Epoch[1 of 3]-Minibatch[61-70 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.5618699; EvalErr[0]PerSample = 0.82343751; TotalTime=0.056094; TotalTimePerSample=8.7646875e-05, SamplesPerSecond=11409
- Epoch[1 of 3]-Minibatch[71-80 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.4279053; EvalErr[0]PerSample = 0.80781251; TotalTime=0.063459; TotalTimePerSample=9.9154687e-05, SamplesPerSecond=10085
- Epoch[1 of 3]-Minibatch[81-90 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.3392854; EvalErr[0]PerSample = 0.7734375; TotalTime=0.062265; TotalTimePerSample=9.7289063e-05, SamplesPerSecond=10278
- Epoch[1 of 3]-Minibatch[91-100 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.3639894; EvalErr[0]PerSample = 0.84375; TotalTime=0.059843; TotalTimePerSample=9.3504687e-05, SamplesPerSecond=10694
+Validating for node EvalErrorPrediction. 10 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 62]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 62]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
+
+Validating for node EvalErrorPrediction, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 62]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 62]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
+
+9 out of 20 nodes do not share the minibatch layout with the input data.
+
+
+Starting minibatch loop.
+ Epoch[ 1 of 3]-Minibatch[   1-  10 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.32135277; EvalErr[0]PerSample = 0.90000000; TotalTime = 0.05742s; TotalTimePerSample = 0.08972ms; SamplesPerSecond = 11145
+ Epoch[ 1 of 3]-Minibatch[  11-  20 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.15070992; EvalErr[0]PerSample = 0.86718750; TotalTime = 0.05557s; TotalTimePerSample = 0.08682ms; SamplesPerSecond = 11517
+ Epoch[ 1 of 3]-Minibatch[  21-  30 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.99901123; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.05549s; TotalTimePerSample = 0.08671ms; SamplesPerSecond = 11532
+ Epoch[ 1 of 3]-Minibatch[  31-  40 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.86945953; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.05588s; TotalTimePerSample = 0.08732ms; SamplesPerSecond = 11452
+ Epoch[ 1 of 3]-Minibatch[  41-  50 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.80219574; EvalErr[0]PerSample = 0.87812500; TotalTime = 0.05549s; TotalTimePerSample = 0.08670ms; SamplesPerSecond = 11534
+ Epoch[ 1 of 3]-Minibatch[  51-  60 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.72890930; EvalErr[0]PerSample = 0.86875000; TotalTime = 0.05552s; TotalTimePerSample = 0.08675ms; SamplesPerSecond = 11526
+ Epoch[ 1 of 3]-Minibatch[  61-  70 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.56186981; EvalErr[0]PerSample = 0.82343750; TotalTime = 0.05571s; TotalTimePerSample = 0.08705ms; SamplesPerSecond = 11488
+ Epoch[ 1 of 3]-Minibatch[  71-  80 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.42790527; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.05550s; TotalTimePerSample = 0.08672ms; SamplesPerSecond = 11531
+ Epoch[ 1 of 3]-Minibatch[  81-  90 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.33928528; EvalErr[0]PerSample = 0.77343750; TotalTime = 0.05557s; TotalTimePerSample = 0.08683ms; SamplesPerSecond = 11517
+ Epoch[ 1 of 3]-Minibatch[  91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.36398926; EvalErr[0]PerSample = 0.84375000; TotalTime = 0.05550s; TotalTimePerSample = 0.08671ms; SamplesPerSecond = 11532
 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times.
- Epoch[1 of 3]-Minibatch[101-110 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.2122345; EvalErr[0]PerSample = 0.75312501; TotalTime=0.062375; TotalTimePerSample=9.7460937e-05, SamplesPerSecond=10260
- Epoch[1 of 3]-Minibatch[111-120 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.3126526; EvalErr[0]PerSample = 0.78750002; TotalTime=0.061085; TotalTimePerSample=9.5445313e-05, SamplesPerSecond=10477
- Epoch[1 of 3]-Minibatch[121-130 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.1408203; EvalErr[0]PerSample = 0.74687499; TotalTime=0.064562; TotalTimePerSample=0.00010087812, SamplesPerSecond=9912
- Epoch[1 of 3]-Minibatch[131-140 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.006897; EvalErr[0]PerSample = 0.69687498; TotalTime=0.0575; TotalTimePerSample=8.984375e-05, SamplesPerSecond=11130
- Epoch[1 of 3]-Minibatch[141-150 of 320]: SamplesSeen = 640; TrainLossPerSample = 3.0049591; EvalErr[0]PerSample = 0.72343749; TotalTime=0.058338; TotalTimePerSample=9.1153125e-05, SamplesPerSecond=10970
- Epoch[1 of 3]-Minibatch[151-160 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.9785829; EvalErr[0]PerSample = 0.73906249; TotalTime=0.064603; TotalTimePerSample=0.00010094219, SamplesPerSecond=9906
- Epoch[1 of 3]-Minibatch[161-170 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.8568604; EvalErr[0]PerSample = 0.70781249; TotalTime=0.060368; TotalTimePerSample=9.4325e-05, SamplesPerSecond=10601
- Epoch[1 of 3]-Minibatch[171-180 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.6905334; EvalErr[0]PerSample = 0.671875; TotalTime=0.059125; TotalTimePerSample=9.2382812e-05, SamplesPerSecond=10824
- Epoch[1 of 3]-Minibatch[181-190 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.7865357; EvalErr[0]PerSample = 0.70468748; TotalTime=0.056113; TotalTimePerSample=8.7676563e-05, SamplesPerSecond=11405
- Epoch[1 of 3]-Minibatch[191-200 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.5770202; EvalErr[0]PerSample = 0.6484375; TotalTime=0.060745; TotalTimePerSample=9.4914062e-05, SamplesPerSecond=10535
- Epoch[1 of 3]-Minibatch[201-210 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.6157165; EvalErr[0]PerSample = 0.6640625; TotalTime=0.059709; TotalTimePerSample=9.3295312e-05, SamplesPerSecond=10718
- Epoch[1 of 3]-Minibatch[211-220 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.552362; EvalErr[0]PerSample = 0.65781248; TotalTime=0.061917; TotalTimePerSample=9.6745313e-05, SamplesPerSecond=10336
- Epoch[1 of 3]-Minibatch[221-230 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.4821167; EvalErr[0]PerSample = 0.625; TotalTime=0.053813; TotalTimePerSample=8.4082813e-05, SamplesPerSecond=11893
- Epoch[1 of 3]-Minibatch[231-240 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.3877869; EvalErr[0]PerSample = 0.62812501; TotalTime=0.061932; TotalTimePerSample=9.676875e-05, SamplesPerSecond=10333
- Epoch[1 of 3]-Minibatch[241-250 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.3690064; EvalErr[0]PerSample = 0.6484375; TotalTime=0.059294; TotalTimePerSample=9.2646875e-05, SamplesPerSecond=10793
- Epoch[1 of 3]-Minibatch[251-260 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.4396729; EvalErr[0]PerSample = 0.6328125; TotalTime=0.060513; TotalTimePerSample=9.4551562e-05, SamplesPerSecond=10576
- Epoch[1 of 3]-Minibatch[261-270 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.3028197; EvalErr[0]PerSample = 0.61250001; TotalTime=0.06037; TotalTimePerSample=9.4328125e-05, SamplesPerSecond=10601
- Epoch[1 of 3]-Minibatch[271-280 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.1966858; EvalErr[0]PerSample = 0.55937499; TotalTime=0.056485; TotalTimePerSample=8.8257812e-05, SamplesPerSecond=11330
- Epoch[1 of 3]-Minibatch[281-290 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.2898011; EvalErr[0]PerSample = 0.60468751; TotalTime=0.059356; TotalTimePerSample=9.274375e-05, SamplesPerSecond=10782
- Epoch[1 of 3]-Minibatch[291-300 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.1775086; EvalErr[0]PerSample = 0.62187499; TotalTime=0.059501; TotalTimePerSample=9.2970312e-05, SamplesPerSecond=10756
- Epoch[1 of 3]-Minibatch[301-310 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.2626343; EvalErr[0]PerSample = 0.59687501; TotalTime=0.064342; TotalTimePerSample=0.00010053437, SamplesPerSecond=9946
- Epoch[1 of 3]-Minibatch[311-320 of 320]: SamplesSeen = 640; TrainLossPerSample = 2.1507263; EvalErr[0]PerSample = 0.5625; TotalTime=0.064522; TotalTimePerSample=0.00010081563, SamplesPerSecond=9919
-Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 2.9799569; EvalErrPerSample = 0.72216797; Ave LearnRatePerSample = 0.015625; EpochTime=1.935613
-Starting Epoch 2: learning rate per sample = 0.001953  momentum = 0.656119 
-minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480) with 1 datapasses
- Epoch[2 of 3]-Minibatch[1-10 of 80]: SamplesSeen = 2560; TrainLossPerSample = 2.0159853; EvalErr[0]PerSample = 0.54140627; TotalTime=0.102487; TotalTimePerSample=4.0033984e-05, SamplesPerSecond=24978
- Epoch[2 of 3]-Minibatch[11-20 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.9881856; EvalErr[0]PerSample = 0.54296875; TotalTime=0.09473; TotalTimePerSample=3.7003906e-05, SamplesPerSecond=27024
- Epoch[2 of 3]-Minibatch[21-30 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.9869812; EvalErr[0]PerSample = 0.54140627; TotalTime=0.091318; TotalTimePerSample=3.5671094e-05, SamplesPerSecond=28033
- Epoch[2 of 3]-Minibatch[31-40 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.9312614; EvalErr[0]PerSample = 0.5277344; TotalTime=0.092408; TotalTimePerSample=3.6096875e-05, SamplesPerSecond=27703
- Epoch[2 of 3]-Minibatch[41-50 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.9006774; EvalErr[0]PerSample = 0.52656251; TotalTime=0.098698; TotalTimePerSample=3.8553906e-05, SamplesPerSecond=25937
- Epoch[2 of 3]-Minibatch[51-60 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.9711578; EvalErr[0]PerSample = 0.54140627; TotalTime=0.0896; TotalTimePerSample=3.5e-05, SamplesPerSecond=28571
- Epoch[2 of 3]-Minibatch[61-70 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.8951813; EvalErr[0]PerSample = 0.52031249; TotalTime=0.092477; TotalTimePerSample=3.6123828e-05, SamplesPerSecond=27682
- Epoch[2 of 3]-Minibatch[71-80 of 80]: SamplesSeen = 2560; TrainLossPerSample = 1.904506; EvalErr[0]PerSample = 0.53164065; TotalTime=0.091179; TotalTimePerSample=3.5616797e-05, SamplesPerSecond=28076
-Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.949242; EvalErrPerSample = 0.53417969; Ave LearnRatePerSample = 0.001953125; EpochTime=0.753703
-Starting Epoch 3: learning rate per sample = 0.000098  momentum = 0.656119 
-minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960) with 1 datapasses
- Epoch[3 of 3]-Minibatch[1-10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.8735985; EvalErr[0]PerSample = 0.51933593; TotalTime=0.27395; TotalTimePerSample=2.675293e-05, SamplesPerSecond=37379
- Epoch[3 of 3]-Minibatch[11-20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.8665626; EvalErr[0]PerSample = 0.51748049; TotalTime=0.261453; TotalTimePerSample=2.553252e-05, SamplesPerSecond=39165
-Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8700806; EvalErrPerSample = 0.51840824; Ave LearnRatePerSample = 9.765625146e-05; EpochTime=0.537273
+ Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.21223450; EvalErr[0]PerSample = 0.75312500; TotalTime = 0.05582s; TotalTimePerSample = 0.08723ms; SamplesPerSecond = 11464
+ Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.31265259; EvalErr[0]PerSample = 0.78750000; TotalTime = 0.05591s; TotalTimePerSample = 0.08736ms; SamplesPerSecond = 11446
+ Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.14082031; EvalErr[0]PerSample = 0.74687500; TotalTime = 0.05556s; TotalTimePerSample = 0.08680ms; SamplesPerSecond = 11520
+ Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.00689697; EvalErr[0]PerSample = 0.69687500; TotalTime = 0.05566s; TotalTimePerSample = 0.08696ms; SamplesPerSecond = 11499
+ Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.00496216; EvalErr[0]PerSample = 0.72343750; TotalTime = 0.05562s; TotalTimePerSample = 0.08690ms; SamplesPerSecond = 11506
+ Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.97858887; EvalErr[0]PerSample = 0.73906250; TotalTime = 0.05559s; TotalTimePerSample = 0.08687ms; SamplesPerSecond = 11512
+ Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.85686035; EvalErr[0]PerSample = 0.70781250; TotalTime = 0.05570s; TotalTimePerSample = 0.08703ms; SamplesPerSecond = 11490
+ Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.69053345; EvalErr[0]PerSample = 0.67187500; TotalTime = 0.05565s; TotalTimePerSample = 0.08695ms; SamplesPerSecond = 11501
+ Epoch[ 1 of 3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.78653564; EvalErr[0]PerSample = 0.70468750; TotalTime = 0.05552s; TotalTimePerSample = 0.08674ms; SamplesPerSecond = 11528
+ Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.57702026; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.05548s; TotalTimePerSample = 0.08669ms; SamplesPerSecond = 11535
+ Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.61571655; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.05545s; TotalTimePerSample = 0.08663ms; SamplesPerSecond = 11542
+ Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.55236206; EvalErr[0]PerSample = 0.65781250; TotalTime = 0.05567s; TotalTimePerSample = 0.08698ms; SamplesPerSecond = 11496
+ Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.48211670; EvalErr[0]PerSample = 0.62500000; TotalTime = 0.05560s; TotalTimePerSample = 0.08688ms; SamplesPerSecond = 11510
+ Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.38778687; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.05546s; TotalTimePerSample = 0.08666ms; SamplesPerSecond = 11539
+ Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.36900635; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.05560s; TotalTimePerSample = 0.08687ms; SamplesPerSecond = 11511
+ Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.43967285; EvalErr[0]PerSample = 0.63281250; TotalTime = 0.05553s; TotalTimePerSample = 0.08677ms; SamplesPerSecond = 11524
+ Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.30281982; EvalErr[0]PerSample = 0.61250000; TotalTime = 0.05553s; TotalTimePerSample = 0.08677ms; SamplesPerSecond = 11525
+ Epoch[ 1 of 3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.19668579; EvalErr[0]PerSample = 0.55937500; TotalTime = 0.05553s; TotalTimePerSample = 0.08677ms; SamplesPerSecond = 11525
+ Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.28980103; EvalErr[0]PerSample = 0.60468750; TotalTime = 0.05551s; TotalTimePerSample = 0.08674ms; SamplesPerSecond = 11529
+ Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.17750854; EvalErr[0]PerSample = 0.62187500; TotalTime = 0.05574s; TotalTimePerSample = 0.08709ms; SamplesPerSecond = 11482
+ Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.26263428; EvalErr[0]PerSample = 0.59687500; TotalTime = 0.05555s; TotalTimePerSample = 0.08679ms; SamplesPerSecond = 11521
+ Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.15072632; EvalErr[0]PerSample = 0.56250000; TotalTime = 0.05427s; TotalTimePerSample = 0.08479ms; SamplesPerSecond = 11793
+Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 2.9799573; EvalErrPerSample = 0.72216797; Ave LearnRatePerSample = 0.015625; EpochTime=1.785537
+Starting Epoch 2: learning rate per sample = 0.001953  effective momentum = 0.656119 
+minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 0 of 1, with 1 datapasses
+
+Starting minibatch loop.
+ Epoch[ 2 of 3]-Minibatch[   1-  10 of 80]: SamplesSeen = 2560; TrainLossPerSample =  2.01598530; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.09354s; TotalTimePerSample = 0.03654ms; SamplesPerSecond = 27367
+ Epoch[ 2 of 3]-Minibatch[  11-  20 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98818569; EvalErr[0]PerSample = 0.54296875; TotalTime = 0.09083s; TotalTimePerSample = 0.03548ms; SamplesPerSecond = 28184
+ Epoch[ 2 of 3]-Minibatch[  21-  30 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98698120; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.09109s; TotalTimePerSample = 0.03558ms; SamplesPerSecond = 28103
+ Epoch[ 2 of 3]-Minibatch[  31-  40 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.93126144; EvalErr[0]PerSample = 0.52773437; TotalTime = 0.09077s; TotalTimePerSample = 0.03546ms; SamplesPerSecond = 28203
+ Epoch[ 2 of 3]-Minibatch[  41-  50 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.90067749; EvalErr[0]PerSample = 0.52656250; TotalTime = 0.09081s; TotalTimePerSample = 0.03547ms; SamplesPerSecond = 28191
+ Epoch[ 2 of 3]-Minibatch[  51-  60 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.97115784; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.09085s; TotalTimePerSample = 0.03549ms; SamplesPerSecond = 28179
+ Epoch[ 2 of 3]-Minibatch[  61-  70 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.89518127; EvalErr[0]PerSample = 0.52031250; TotalTime = 0.09092s; TotalTimePerSample = 0.03552ms; SamplesPerSecond = 28155
+ Epoch[ 2 of 3]-Minibatch[  71-  80 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.90450592; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.08529s; TotalTimePerSample = 0.03332ms; SamplesPerSecond = 30014
+Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.949242; EvalErrPerSample = 0.53417969; Ave LearnRatePerSample = 0.001953125; EpochTime=0.732528
+Starting Epoch 3: learning rate per sample = 0.000098  effective momentum = 0.656119 
+minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 0 of 1, with 1 datapasses
+
+Starting minibatch loop.
+ Epoch[ 3 of 3]-Minibatch[   1-  10 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.87359848; EvalErr[0]PerSample = 0.51933594; TotalTime = 0.24564s; TotalTimePerSample = 0.02399ms; SamplesPerSecond = 41687
+ Epoch[ 3 of 3]-Minibatch[  11-  20 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.86656265; EvalErr[0]PerSample = 0.51748047; TotalTime = 0.21814s; TotalTimePerSample = 0.02130ms; SamplesPerSecond = 46943
+Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8700806; EvalErrPerSample = 0.51840824; Ave LearnRatePerSample = 9.765625146e-05; EpochTime=0.493964
+CNTKCommandTrainEnd: speechTrain
 COMPLETED
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-WARNING:
-
-You should always run with libnvidia-ml.so that is installed with your
-NVIDIA Display Driver. By default it's installed in /usr/lib and /usr/lib64.
-libnvidia-ml.so in GDK package is a stub library that is attached only for
-build purposes (e.g. machine that you build your application doesn't have
-to have Display Driver installed).
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Linked to libnvidia-ml library at wrong path : /usr/src/gdk/nvml/lib/libnvidia-ml.so.1
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-WARNING:
-
-You should always run with libnvidia-ml.so that is installed with your
-NVIDIA Display Driver. By default it's installed in /usr/lib and /usr/lib64.
-libnvidia-ml.so in GDK package is a stub library that is attached only for
-build purposes (e.g. machine that you build your application doesn't have
-to have Display Driver installed).
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 === Deleting last epoch data
 ==== Re-running from checkpoint
-running on localhost at 2015/07/29 19:11:14
-command line options: 
-configFile=/home/vlivan/cntk/Tests/Speech/QuickE2E/cntk.config RunDir=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu DataDir=/home/vlivan/cntk/Tests/Speech/Data DeviceId=Auto 
+=== Running /home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E/cntk.config RunDir=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E DeviceId=0
+running on localhost at 2015/10/24 12:49:11
+command line: 
+/home/mluser/src/cplx_master/build/debug/bin/cntk configFile=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E/cntk.config RunDir=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E DeviceId=0 
 
 >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
 precision=float
@@ -388,6 +581,31 @@ speechTrain=[
         uniformInit=true
         needPrior=true
     ]
+    ExperimentalNetworkBuilder=[    // the same as above but with BS
+        layerSizes=363:512:512:132
+        trainingCriterion='CE'
+        evalCriterion='Err'
+        applyMeanVarNorm=true
+        L = Length(layerSizes)-1    // number of model layers
+        features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label')
+        featNorm = if applyMeanVarNorm
+                   then MeanVarNorm(features)
+                   else features
+        layers[layer:1..L-1] = if layer > 1
+                               then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1])
+                               else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1])
+        outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1])
+        outZ = outLayer.z        // + PastValue(layerSizes[L], 1, outLayer.z)
+        CE = if trainingCriterion == 'CE'
+             then CrossEntropyWithSoftmax(labels, outZ, tag='criterion')
+             else Fail('unknown trainingCriterion ' + trainingCriterion)
+        Err = if evalCriterion == 'Err' then
+              ErrorPrediction(labels, outZ, tag='eval')
+              else Fail('unknown evalCriterion ' + evalCriterion)
+        logPrior = LogPrior(labels)
+        // TODO: how to add a tag to an infix operation?
+        ScaledLogLikelihood = Minus (outZ, logPrior, tag='output')
+    ]
     SGD=[
         epochSize=20480
         minibatchSize=64:256:1024:
@@ -426,21 +644,22 @@ speechTrain=[
       ]
     ]
 ]
-RunDir=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu
-DataDir=/home/vlivan/cntk/Tests/Speech/Data
-DeviceId=Auto
+RunDir=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu
+DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
+ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E
+DeviceId=0
 
 <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
 
 >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 precision=float
 command=speechTrain
-deviceId=Auto
+deviceId=0
 parallelTrain=false
 speechTrain=[
     action=train
-    modelPath=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn
-    deviceId=Auto
+    modelPath=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn
+    deviceId=0
     traceLevel=1
     SimpleNetworkBuilder=[
         layerSizes=363:512:512:132
@@ -452,6 +671,31 @@ speechTrain=[
         uniformInit=true
         needPrior=true
     ]
+    ExperimentalNetworkBuilder=[    // the same as above but with BS
+        layerSizes=363:512:512:132
+        trainingCriterion='CE'
+        evalCriterion='Err'
+        applyMeanVarNorm=true
+        L = Length(layerSizes)-1    // number of model layers
+        features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label')
+        featNorm = if applyMeanVarNorm
+                   then MeanVarNorm(features)
+                   else features
+        layers[layer:1..L-1] = if layer > 1
+                               then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1])
+                               else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1])
+        outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1])
+        outZ = outLayer.z        // + PastValue(layerSizes[L], 1, outLayer.z)
+        CE = if trainingCriterion == 'CE'
+             then CrossEntropyWithSoftmax(labels, outZ, tag='criterion')
+             else Fail('unknown trainingCriterion ' + trainingCriterion)
+        Err = if evalCriterion == 'Err' then
+              ErrorPrediction(labels, outZ, tag='eval')
+              else Fail('unknown evalCriterion ' + evalCriterion)
+        logPrior = LogPrior(labels)
+        // TODO: how to add a tag to an infix operation?
+        ScaledLogLikelihood = Minus (outZ, logPrior, tag='output')
+    ]
     SGD=[
         epochSize=20480
         minibatchSize=64:256:1024:
@@ -483,30 +727,32 @@ speechTrain=[
           scpFile=glob_0000.scp
       ]
       labels=[
-          mlfFile=/home/vlivan/cntk/Tests/Speech/Data/glob_0000.mlf
-          labelMappingFile=/home/vlivan/cntk/Tests/Speech/Data/state.list
+          mlfFile=/home/mluser/src/cplx_master/Tests/Speech/Data/glob_0000.mlf
+          labelMappingFile=/home/mluser/src/cplx_master/Tests/Speech/Data/state.list
           labelDim=132
           labelType=Category
       ]
     ]
 ]
-RunDir=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu
-DataDir=/home/vlivan/cntk/Tests/Speech/Data
-DeviceId=Auto
+RunDir=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu
+DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
+ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E
+DeviceId=0
 
 <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 
 >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 configparameters: cntk.config:command=speechTrain
-configparameters: cntk.config:DataDir=/home/vlivan/cntk/Tests/Speech/Data
-configparameters: cntk.config:deviceId=Auto
+configparameters: cntk.config:ConfigDir=/home/mluser/src/cplx_master/Tests/Speech/QuickE2E
+configparameters: cntk.config:DataDir=/home/mluser/src/cplx_master/Tests/Speech/Data
+configparameters: cntk.config:deviceId=0
 configparameters: cntk.config:parallelTrain=false
 configparameters: cntk.config:precision=float
-configparameters: cntk.config:RunDir=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu
+configparameters: cntk.config:RunDir=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu
 configparameters: cntk.config:speechTrain=[
     action=train
-    modelPath=/tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn
-    deviceId=Auto
+    modelPath=/tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn
+    deviceId=0
     traceLevel=1
     SimpleNetworkBuilder=[
         layerSizes=363:512:512:132
@@ -518,6 +764,31 @@ configparameters: cntk.config:speechTrain=[
         uniformInit=true
         needPrior=true
     ]
+    ExperimentalNetworkBuilder=[    // the same as above but with BS
+        layerSizes=363:512:512:132
+        trainingCriterion='CE'
+        evalCriterion='Err'
+        applyMeanVarNorm=true
+        L = Length(layerSizes)-1    // number of model layers
+        features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label')
+        featNorm = if applyMeanVarNorm
+                   then MeanVarNorm(features)
+                   else features
+        layers[layer:1..L-1] = if layer > 1
+                               then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1])
+                               else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1])
+        outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1])
+        outZ = outLayer.z        // + PastValue(layerSizes[L], 1, outLayer.z)
+        CE = if trainingCriterion == 'CE'
+             then CrossEntropyWithSoftmax(labels, outZ, tag='criterion')
+             else Fail('unknown trainingCriterion ' + trainingCriterion)
+        Err = if evalCriterion == 'Err' then
+              ErrorPrediction(labels, outZ, tag='eval')
+              else Fail('unknown evalCriterion ' + evalCriterion)
+        logPrior = LogPrior(labels)
+        // TODO: how to add a tag to an infix operation?
+        ScaledLogLikelihood = Minus (outZ, logPrior, tag='output')
+    ]
     SGD=[
         epochSize=20480
         minibatchSize=64:256:1024:
@@ -549,8 +820,8 @@ configparameters: cntk.config:speechTrain=[
           scpFile=glob_0000.scp
       ]
       labels=[
-          mlfFile=/home/vlivan/cntk/Tests/Speech/Data/glob_0000.mlf
-          labelMappingFile=/home/vlivan/cntk/Tests/Speech/Data/state.list
+          mlfFile=/home/mluser/src/cplx_master/Tests/Speech/Data/glob_0000.mlf
+          labelMappingFile=/home/mluser/src/cplx_master/Tests/Speech/Data/state.list
           labelDim=132
           labelType=Category
       ]
@@ -560,24 +831,24 @@ configparameters: cntk.config:speechTrain=[
 <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 command: speechTrain 
 precision = float
-lsof: WARNING: can't stat() ext4 file system /var/lib/docker/aufs
-      Output information may be incomplete.
-LockDevice: Capture device 0 and lock it for exclusive use
-LockDevice: Capture device 0 and lock it for exclusive use
+CNTKModelPath: /tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn
+CNTKCommandTrainInfo: speechTrain : 3
+CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3
+CNTKCommandTrainBegin: speechTrain
 SimpleNetworkBuilder Using GPU 0
 reading script file glob_0000.scp ... 948 entries
-total 132 state names in state list /home/vlivan/cntk/Tests/Speech/Data/state.list
-htkmlfreader: reading MLF file /home/vlivan/cntk/Tests/Speech/Data/glob_0000.mlf ...parse the line 55130
- total 948 entries
+trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion
+total 132 state names in state list /home/mluser/src/cplx_master/Tests/Speech/Data/state.list
+htkmlfreader: reading MLF file /home/mluser/src/cplx_master/Tests/Speech/Data/glob_0000.mlf ... total 948 entries
 ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
 label set 0: 129 classes
 minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
-Starting from checkpoint. Load Network From File /tmp/cntk-test-20150729191101.973007/Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn.2.
+Starting from checkpoint. Load Network From File /tmp/cntk-test-20151024124900.548963/Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn.2.
 
 
 Printing Gradient Computation Node Order ... 
 
-CrossEntropyWithSoftmax[0, 0] = CrossEntropyWithSoftmax(labels[132, 256], HLast[0, 0])
+CrossEntropyWithSoftmax[0, 0] = CrossEntropyWithSoftmax(labels[132, 0], HLast[0, 0])
 HLast[0, 0] = Plus(W2*H1[0, 0], B2[132, 1])
 B2[132, 1] = LearnableParameter
 W2*H1[0, 0] = Times(W2[132, 512], H2[0, 0])
@@ -589,170 +860,475 @@ H1[0, 0] = Sigmoid(W0*features+B0[0, 0])
 W0*features+B0[0, 0] = Plus(W0*features[0, 0], B0[512, 1])
 B0[512, 1] = LearnableParameter
 W0*features[0, 0] = Times(W0[512, 363], MVNormalizedFeatures[0, 0])
-MVNormalizedFeatures[0, 0] = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
-InvStdOfFeatures[363, 1] = InvStdDev(features[363, 256])
-MeanOfFeatures[363, 1] = Mean(features[363, 256])
-features[363, 256] = InputValue
+MVNormalizedFeatures[0, 0] = PerDimMeanVarNormalization(features[363, 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
+InvStdOfFeatures[363, 1] = InvStdDev(features[363, 0])
+MeanOfFeatures[363, 1] = Mean(features[363, 0])
+features[363, 0] = InputValue
 W0[512, 363] = LearnableParameter
 W1[512, 512] = LearnableParameter
 W2[132, 512] = LearnableParameter
-labels[132, 256] = InputValue
+labels[132, 0] = InputValue
 
-Validating node CrossEntropyWithSoftmax 
+Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1.
 
-Validating --> labels = InputValue
-Validating --> W2 = LearnableParameter
-Validating --> W1 = LearnableParameter
-Validating --> W0 = LearnableParameter
-Validating --> features = InputValue
-Validating --> MeanOfFeatures = Mean(features[363, 256])
-Validating --> InvStdOfFeatures = InvStdDev(features[363, 256])
-Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
-Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256])
-Validating --> B0 = LearnableParameter
-Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1])
-Validating --> H1 = Sigmoid(W0*features+B0[512, 256])
-Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256])
-Validating --> B1 = LearnableParameter
-Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1])
-Validating --> H2 = Sigmoid(W1*H1+B1[512, 256])
-Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256])
-Validating --> B2 = LearnableParameter
-Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1])
-Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, 256], HLast[132, 256])
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+Validating for node CrossEntropyWithSoftmax, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+9 out of 20 nodes do not share the minibatch layout with the input data.
 
 
 
-Validating node ScaledLogLikelihood 
+Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1.
 
-Validating --> W2 = LearnableParameter
-Validating --> W1 = LearnableParameter
-Validating --> W0 = LearnableParameter
-Validating --> features = InputValue
-Validating --> MeanOfFeatures = Mean(features[363, 256])
-Validating --> InvStdOfFeatures = InvStdDev(features[363, 256])
-Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
-Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256])
-Validating --> B0 = LearnableParameter
-Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1])
-Validating --> H1 = Sigmoid(W0*features+B0[512, 256])
-Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256])
-Validating --> B1 = LearnableParameter
-Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1])
-Validating --> H2 = Sigmoid(W1*H1+B1[512, 256])
-Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256])
-Validating --> B2 = LearnableParameter
-Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1])
-Validating --> labels = InputValue
-Validating --> Prior = Mean(labels[132, 256])
-Validating --> LogOfPrior = Log(Prior[132, 1])
-Validating --> ScaledLogLikelihood = Minus(HLast[132, 256], LogOfPrior[132, 1])
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+Validating for node CrossEntropyWithSoftmax. 9 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+Validating for node CrossEntropyWithSoftmax, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+9 out of 20 nodes do not share the minibatch layout with the input data.
 
 
 
-Validating node EvalErrorPrediction 
+Validating for node ScaledLogLikelihood. 22 nodes to process in pass 1.
 
-Validating --> labels = InputValue
-Validating --> W2 = LearnableParameter
-Validating --> W1 = LearnableParameter
-Validating --> W0 = LearnableParameter
-Validating --> features = InputValue
-Validating --> MeanOfFeatures = Mean(features[363, 256])
-Validating --> InvStdOfFeatures = InvStdDev(features[363, 256])
-Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
-Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256])
-Validating --> B0 = LearnableParameter
-Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1])
-Validating --> H1 = Sigmoid(W0*features+B0[512, 256])
-Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256])
-Validating --> B1 = LearnableParameter
-Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1])
-Validating --> H2 = Sigmoid(W1*H1+B1[512, 256])
-Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256])
-Validating --> B2 = LearnableParameter
-Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1])
-Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, 256], HLast[132, 256])
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1]
+Validating --> LogOfPrior = Log(Prior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0]
+
+Validating for node ScaledLogLikelihood. 11 nodes to process in pass 2.
+
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1]
+Validating --> LogOfPrior = Log(Prior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0]
+
+Validating for node ScaledLogLikelihood, final verification.
+
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1]
+Validating --> LogOfPrior = Log(Prior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0]
+
+10 out of 22 nodes do not share the minibatch layout with the input data.
+
+
+
+Validating for node ScaledLogLikelihood. 22 nodes to process in pass 1.
+
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1]
+Validating --> LogOfPrior = Log(Prior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0]
+
+Validating for node ScaledLogLikelihood. 9 nodes to process in pass 2.
+
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1]
+Validating --> LogOfPrior = Log(Prior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0]
+
+Validating for node ScaledLogLikelihood, final verification.
+
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1]
+Validating --> LogOfPrior = Log(Prior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0]
+
+10 out of 22 nodes do not share the minibatch layout with the input data.
+
+
+
+Validating for node EvalErrorPrediction. 20 nodes to process in pass 1.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+Validating for node EvalErrorPrediction. 9 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+Validating for node EvalErrorPrediction, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+9 out of 20 nodes do not share the minibatch layout with the input data.
+
+
+
+Validating for node EvalErrorPrediction. 20 nodes to process in pass 1.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+Validating for node EvalErrorPrediction. 9 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+Validating for node EvalErrorPrediction, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+9 out of 20 nodes do not share the minibatch layout with the input data.
 
 GetTrainCriterionNodes  ...
 GetEvalCriterionNodes  ...
-
-
-Validating node CrossEntropyWithSoftmax 
-
-Validating --> labels = InputValue
-Validating --> W2 = LearnableParameter
-Validating --> W1 = LearnableParameter
-Validating --> W0 = LearnableParameter
-Validating --> features = InputValue
-Validating --> MeanOfFeatures = Mean(features[363, 256])
-Validating --> InvStdOfFeatures = InvStdDev(features[363, 256])
-Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
-Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256])
-Validating --> B0 = LearnableParameter
-Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1])
-Validating --> H1 = Sigmoid(W0*features+B0[512, 256])
-Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256])
-Validating --> B1 = LearnableParameter
-Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1])
-Validating --> H2 = Sigmoid(W1*H1+B1[512, 256])
-Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256])
-Validating --> B2 = LearnableParameter
-Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1])
-Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, 256], HLast[132, 256])
-
 No PreCompute nodes found, skipping PreCompute step
 Set Max Temp Mem Size For Convolution Nodes to 0 samples.
-Starting Epoch 3: learning rate per sample = 0.000098  momentum = 0.656119 
-minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960) with 1 datapasses
+Starting Epoch 3: learning rate per sample = 0.000098  effective momentum = 0.656119 
+minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 0 of 1, with 1 datapasses
 requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
 
-
-Validating node EvalErrorPrediction 
-
-Validating --> labels = InputValue
-Validating --> W2 = LearnableParameter
-Validating --> W1 = LearnableParameter
-Validating --> W0 = LearnableParameter
-Validating --> features = InputValue
-Validating --> MeanOfFeatures = Mean(features[363, 1024])
-Validating --> InvStdOfFeatures = InvStdDev(features[363, 1024])
-Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 1024], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
-Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 1024])
-Validating --> B0 = LearnableParameter
-Validating --> W0*features+B0 = Plus(W0*features[512, 1024], B0[512, 1])
-Validating --> H1 = Sigmoid(W0*features+B0[512, 1024])
-Validating --> W1*H1 = Times(W1[512, 512], H1[512, 1024])
-Validating --> B1 = LearnableParameter
-Validating --> W1*H1+B1 = Plus(W1*H1[512, 1024], B1[512, 1])
-Validating --> H2 = Sigmoid(W1*H1+B1[512, 1024])
-Validating --> W2*H1 = Times(W2[132, 512], H2[512, 1024])
-Validating --> B2 = LearnableParameter
-Validating --> HLast = Plus(W2*H1[132, 1024], B2[132, 1])
-Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, 1024], HLast[132, 1024])
-
- Epoch[3 of 3]-Minibatch[1-10 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.8735985; EvalErr[0]PerSample = 0.51933593; TotalTime=0.430752; TotalTimePerSample=4.2065625e-05, SamplesPerSecond=23772
- Epoch[3 of 3]-Minibatch[11-20 of 20]: SamplesSeen = 10240; TrainLossPerSample = 1.8665626; EvalErr[0]PerSample = 0.51748049; TotalTime=0.2702; TotalTimePerSample=2.6386719e-05, SamplesPerSecond=37897
-Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8700806; EvalErrPerSample = 0.51840824; Ave LearnRatePerSample = 9.765625146e-05; EpochTime=0.868162
+Starting minibatch loop.
+EnforceOneGPUOnly: WARNING: Ignored attempt to change GPU choice from 0 now 1. This message will be shown only once.
+ Epoch[ 3 of 3]-Minibatch[   1-  10 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.87359848; EvalErr[0]PerSample = 0.51933594; TotalTime = 0.32305s; TotalTimePerSample = 0.03155ms; SamplesPerSecond = 31698
+ Epoch[ 3 of 3]-Minibatch[  11-  20 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.86656265; EvalErr[0]PerSample = 0.51748047; TotalTime = 0.21717s; TotalTimePerSample = 0.02121ms; SamplesPerSecond = 47152
+Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8700806; EvalErrPerSample = 0.51840824; Ave LearnRatePerSample = 9.765625146e-05; EpochTime=1.439589
+CNTKCommandTrainEnd: speechTrain
 COMPLETED
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-WARNING:
-
-You should always run with libnvidia-ml.so that is installed with your
-NVIDIA Display Driver. By default it's installed in /usr/lib and /usr/lib64.
-libnvidia-ml.so in GDK package is a stub library that is attached only for
-build purposes (e.g. machine that you build your application doesn't have
-to have Display Driver installed).
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Linked to libnvidia-ml library at wrong path : /usr/src/gdk/nvml/lib/libnvidia-ml.so.1
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-WARNING:
-
-You should always run with libnvidia-ml.so that is installed with your
-NVIDIA Display Driver. By default it's installed in /usr/lib and /usr/lib64.
-libnvidia-ml.so in GDK package is a stub library that is attached only for
-build purposes (e.g. machine that you build your application doesn't have
-to have Display Driver installed).
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
diff --git a/Tests/Speech/QuickE2E/baseline.windows.gpu.txt b/Tests/Speech/QuickE2E/baseline.windows.gpu.txt
index 5c336a37b..d47fb54a4 100644
--- a/Tests/Speech/QuickE2E/baseline.windows.gpu.txt
+++ b/Tests/Speech/QuickE2E/baseline.windows.gpu.txt
@@ -1,18 +1,16 @@
-=== Running /cygdrive/c/Users/svcphil/workspace.vlivan/CNTK-Build-Windows/x64/release/cntk.exe configFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\QuickE2E\cntk.config RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data DeviceId=Auto
+=== Running /cygdrive/e/NetScale/CNTK/git_repos/cplx_master2/x64/debug/cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E/cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E DeviceId=0
 -------------------------------------------------------------------
 Build info: 
 
-		Built time: Aug 11 2015 16:18:17
-		Last modified date: Tue Aug 11 16:16:08 2015
-		Built by svcphil on dphaim-26-new           
-		Build Path: C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\MachineLearning\CNTK\
+		Built time: Oct 24 2015 13:33:25
+		Last modified date: Thu Oct 22 16:00:27 2015
+		Built by amitaga on Amitaga-Win-DT3           
+		Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\
 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
-		Build Branch: master
-		Build SHA1: 397cc7cc16c00b1c12864d331c0729fde7a1bde3
 -------------------------------------------------------------------
-running on dphaim-26-new at 2015/08/11 17:47:26
-command line options: 
-configFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\QuickE2E\cntk.config RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data DeviceId=Auto 
+running on Amitaga-Win-DT3 at 2015/10/24 22:07:22
+command line: 
+E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E/cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E DeviceId=0 
 
 >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
 precision=float
@@ -34,6 +32,31 @@ speechTrain=[
         uniformInit=true
         needPrior=true
     ]
+    ExperimentalNetworkBuilder=[    // the same as above but with BS
+        layerSizes=363:512:512:132
+        trainingCriterion='CE'
+        evalCriterion='Err'
+        applyMeanVarNorm=true
+        L = Length(layerSizes)-1    // number of model layers
+        features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label')
+        featNorm = if applyMeanVarNorm
+                   then MeanVarNorm(features)
+                   else features
+        layers[layer:1..L-1] = if layer > 1
+                               then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1])
+                               else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1])
+        outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1])
+        outZ = outLayer.z        // + PastValue(layerSizes[L], 1, outLayer.z)
+        CE = if trainingCriterion == 'CE'
+             then CrossEntropyWithSoftmax(labels, outZ, tag='criterion')
+             else Fail('unknown trainingCriterion ' + trainingCriterion)
+        Err = if evalCriterion == 'Err' then
+              ErrorPrediction(labels, outZ, tag='eval')
+              else Fail('unknown evalCriterion ' + evalCriterion)
+        logPrior = LogPrior(labels)
+        // TODO: how to add a tag to an infix operation?
+        ScaledLogLikelihood = Minus (outZ, logPrior, tag='output')
+    ]
     SGD=[
         epochSize=20480
         minibatchSize=64:256:1024:
@@ -72,21 +95,22 @@ speechTrain=[
       ]
     ]
 ]
-RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu
-DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data
-DeviceId=Auto
+RunDir=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu
+DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
+ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E
+DeviceId=0
 
 <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
 
 >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 precision=float
 command=speechTrain
-deviceId=Auto
+deviceId=0
 parallelTrain=false
 speechTrain=[
     action=train
-    modelPath=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn
-    deviceId=Auto
+    modelPath=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn
+    deviceId=0
     traceLevel=1
     SimpleNetworkBuilder=[
         layerSizes=363:512:512:132
@@ -98,6 +122,31 @@ speechTrain=[
         uniformInit=true
         needPrior=true
     ]
+    ExperimentalNetworkBuilder=[    // the same as above but with BS
+        layerSizes=363:512:512:132
+        trainingCriterion='CE'
+        evalCriterion='Err'
+        applyMeanVarNorm=true
+        L = Length(layerSizes)-1    // number of model layers
+        features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label')
+        featNorm = if applyMeanVarNorm
+                   then MeanVarNorm(features)
+                   else features
+        layers[layer:1..L-1] = if layer > 1
+                               then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1])
+                               else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1])
+        outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1])
+        outZ = outLayer.z        // + PastValue(layerSizes[L], 1, outLayer.z)
+        CE = if trainingCriterion == 'CE'
+             then CrossEntropyWithSoftmax(labels, outZ, tag='criterion')
+             else Fail('unknown trainingCriterion ' + trainingCriterion)
+        Err = if evalCriterion == 'Err' then
+              ErrorPrediction(labels, outZ, tag='eval')
+              else Fail('unknown evalCriterion ' + evalCriterion)
+        logPrior = LogPrior(labels)
+        // TODO: how to add a tag to an infix operation?
+        ScaledLogLikelihood = Minus (outZ, logPrior, tag='output')
+    ]
     SGD=[
         epochSize=20480
         minibatchSize=64:256:1024:
@@ -129,30 +178,32 @@ speechTrain=[
           scpFile=glob_0000.scp
       ]
       labels=[
-          mlfFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf
-          labelMappingFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list
+          mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf
+          labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list
           labelDim=132
           labelType=Category
       ]
     ]
 ]
-RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu
-DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data
-DeviceId=Auto
+RunDir=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu
+DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
+ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E
+DeviceId=0
 
 <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 
 >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 configparameters: cntk.config:command=speechTrain
-configparameters: cntk.config:DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data
-configparameters: cntk.config:deviceId=Auto
+configparameters: cntk.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E
+configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
+configparameters: cntk.config:deviceId=0
 configparameters: cntk.config:parallelTrain=false
 configparameters: cntk.config:precision=float
-configparameters: cntk.config:RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu
+configparameters: cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu
 configparameters: cntk.config:speechTrain=[
     action=train
-    modelPath=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn
-    deviceId=Auto
+    modelPath=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn
+    deviceId=0
     traceLevel=1
     SimpleNetworkBuilder=[
         layerSizes=363:512:512:132
@@ -164,6 +215,31 @@ configparameters: cntk.config:speechTrain=[
         uniformInit=true
         needPrior=true
     ]
+    ExperimentalNetworkBuilder=[    // the same as above but with BS
+        layerSizes=363:512:512:132
+        trainingCriterion='CE'
+        evalCriterion='Err'
+        applyMeanVarNorm=true
+        L = Length(layerSizes)-1    // number of model layers
+        features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label')
+        featNorm = if applyMeanVarNorm
+                   then MeanVarNorm(features)
+                   else features
+        layers[layer:1..L-1] = if layer > 1
+                               then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1])
+                               else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1])
+        outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1])
+        outZ = outLayer.z        // + PastValue(layerSizes[L], 1, outLayer.z)
+        CE = if trainingCriterion == 'CE'
+             then CrossEntropyWithSoftmax(labels, outZ, tag='criterion')
+             else Fail('unknown trainingCriterion ' + trainingCriterion)
+        Err = if evalCriterion == 'Err' then
+              ErrorPrediction(labels, outZ, tag='eval')
+              else Fail('unknown evalCriterion ' + evalCriterion)
+        logPrior = LogPrior(labels)
+        // TODO: how to add a tag to an infix operation?
+        ScaledLogLikelihood = Minus (outZ, logPrior, tag='output')
+    ]
     SGD=[
         epochSize=20480
         minibatchSize=64:256:1024:
@@ -195,8 +271,8 @@ configparameters: cntk.config:speechTrain=[
           scpFile=glob_0000.scp
       ]
       labels=[
-          mlfFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf
-          labelMappingFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list
+          mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf
+          labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list
           labelDim=132
           labelType=Category
       ]
@@ -206,168 +282,301 @@ configparameters: cntk.config:speechTrain=[
 <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 command: speechTrain 
 precision = float
-LockDevice: Capture device 1 and lock it for exclusive use
-LockDevice: Capture device 2 and lock it for exclusive use
-LockDevice: Capture device 3 and lock it for exclusive use
-LockDevice: Capture device 0 and lock it for exclusive use
-LockDevice: Capture device 1 and lock it for exclusive use
-SimpleNetworkBuilder Using GPU 1
+CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn
+CNTKCommandTrainInfo: speechTrain : 3
+CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3
+CNTKCommandTrainBegin: speechTrain
+SimpleNetworkBuilder Using GPU 0
 reading script file glob_0000.scp ... 948 entries
 trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion
-total 132 state names in state list C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list
-htkmlfreader: reading MLF file C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf ... total 948 entries
+total 132 state names in state list E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list
+htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf ... total 948 entries
 ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
 label set 0: 129 classes
 minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
+SetUniformRandomValue (GPU): creating curand object with seed 1
 GetTrainCriterionNodes  ...
 GetEvalCriterionNodes  ...
 
 
-Validating node CrossEntropyWithSoftmax 
+Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1.
 
-Validating --> labels = InputValue
-Validating --> W2 = LearnableParameter
-Validating --> W1 = LearnableParameter
-Validating --> W0 = LearnableParameter
-Validating --> features = InputValue
-Validating --> MeanOfFeatures = Mean(features[363, 3])
-Validating --> InvStdOfFeatures = InvStdDev(features[363, 3])
-Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
-Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 3])
-Validating --> B0 = LearnableParameter
-Validating --> W0*features+B0 = Plus(W0*features[512, 3], B0[512, 1])
-Validating --> H1 = Sigmoid(W0*features+B0[512, 3])
-Validating --> W1*H1 = Times(W1[512, 512], H1[512, 3])
-Validating --> B1 = LearnableParameter
-Validating --> W1*H1+B1 = Plus(W1*H1[512, 3], B1[512, 1])
-Validating --> H2 = Sigmoid(W1*H1+B1[512, 3])
-Validating --> W2*H1 = Times(W2[132, 512], H2[512, 3])
-Validating --> B2 = LearnableParameter
-Validating --> HLast = Plus(W2*H1[132, 3], B2[132, 1])
-Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, 3], HLast[132, 3])
+Validating --> labels = InputValue -> [132, MBSize 3]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 3]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
+Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
+
+Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 3]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 3]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
+Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
+
+Validating for node CrossEntropyWithSoftmax, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 3]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 3]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 3], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 3]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 3]) -> [512, MBSize 3]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 3], B0[512, 1]) -> [512, MBSize 3]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 3]) -> [512, MBSize 3]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 3]) -> [512, MBSize 3]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 3], B1[512, 1]) -> [512, MBSize 3]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 3]) -> [512, MBSize 3]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 3]) -> [132, MBSize 3]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 3], B2[132, 1]) -> [132, MBSize 3]
+Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 3], HLast[132, MBSize 3]) -> [1, 1]
+
+9 out of 20 nodes do not share the minibatch layout with the input data.
+
+
+Precomputing --> 3 PreCompute nodes found.
 
-Found 3 PreCompute nodes
 	NodeName: InvStdOfFeatures
 	NodeName: MeanOfFeatures
 	NodeName: Prior
-minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0) with 1 datapasses
+minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
 requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
 
 
-Validating node InvStdOfFeatures 
+Validating for node InvStdOfFeatures. 2 nodes to process in pass 1.
 
-Validating --> features = InputValue
-Validating --> InvStdOfFeatures = InvStdDev(features[363, 64])
+Validating --> features = InputValue -> [363, MBSize 3]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
+
+Validating for node InvStdOfFeatures, final verification.
+
+Validating --> features = InputValue -> [363, MBSize 3]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 3]) -> [363, 1]
+
+1 out of 2 nodes do not share the minibatch layout with the input data.
 
 
 
-Validating node MeanOfFeatures 
+Validating for node MeanOfFeatures. 2 nodes to process in pass 1.
 
-Validating --> features = InputValue
-Validating --> MeanOfFeatures = Mean(features[363, 64])
+Validating --> features = InputValue -> [363, MBSize 3]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
+
+Validating for node MeanOfFeatures, final verification.
+
+Validating --> features = InputValue -> [363, MBSize 3]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 3]) -> [363, 1]
+
+1 out of 2 nodes do not share the minibatch layout with the input data.
 
 
 
-Validating node Prior 
+Validating for node Prior. 2 nodes to process in pass 1.
 
-Validating --> labels = InputValue
-Validating --> Prior = Mean(labels[132, 64])
+Validating --> labels = InputValue -> [132, MBSize 3]
+Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
+
+Validating for node Prior. 1 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 3]
+Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
+
+Validating for node Prior, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 3]
+Validating --> Prior = Mean(labels[132, MBSize 3]) -> [132, 1]
+
+1 out of 2 nodes do not share the minibatch layout with the input data.
+
+
+Precomputing --> Completed.
 
 Set Max Temp Mem Size For Convolution Nodes to 0 samples.
-Starting Epoch 1: learning rate per sample = 0.015625  momentum = 0.900000 
-minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0) with 1 datapasses
+Starting Epoch 1: learning rate per sample = 0.015625  effective momentum = 0.900000 
+minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
 
 
-Validating node EvalErrorPrediction 
+Validating for node EvalErrorPrediction. 20 nodes to process in pass 1.
 
-Validating --> labels = InputValue
-Validating --> W2 = LearnableParameter
-Validating --> W1 = LearnableParameter
-Validating --> W0 = LearnableParameter
-Validating --> features = InputValue
-Validating --> MeanOfFeatures = Mean(features[363, 64])
-Validating --> InvStdOfFeatures = InvStdDev(features[363, 64])
-Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 64], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
-Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 64])
-Validating --> B0 = LearnableParameter
-Validating --> W0*features+B0 = Plus(W0*features[512, 64], B0[512, 1])
-Validating --> H1 = Sigmoid(W0*features+B0[512, 64])
-Validating --> W1*H1 = Times(W1[512, 512], H1[512, 64])
-Validating --> B1 = LearnableParameter
-Validating --> W1*H1+B1 = Plus(W1*H1[512, 64], B1[512, 1])
-Validating --> H2 = Sigmoid(W1*H1+B1[512, 64])
-Validating --> W2*H1 = Times(W2[132, 512], H2[512, 64])
-Validating --> B2 = LearnableParameter
-Validating --> HLast = Plus(W2*H1[132, 64], B2[132, 1])
-Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, 64], HLast[132, 64])
+Validating --> labels = InputValue -> [132, MBSize 62]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 62]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
 
- Epoch[ 1 of 3]-Minibatch[   1-  10 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.45646143; EvalErr[0]PerSample = 0.92500001; TotalTime = 0.03190s; TotalTimePerSample = 0.04985ms; SamplesPerSecond = 20061
- Epoch[ 1 of 3]-Minibatch[  11-  20 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.22315693; EvalErr[0]PerSample = 0.90156251; TotalTime = 0.02454s; TotalTimePerSample = 0.03835ms; SamplesPerSecond = 26075
- Epoch[ 1 of 3]-Minibatch[  21-  30 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.95180511; EvalErr[0]PerSample = 0.84687501; TotalTime = 0.02438s; TotalTimePerSample = 0.03809ms; SamplesPerSecond = 26254
- Epoch[ 1 of 3]-Minibatch[  31-  40 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.94157934; EvalErr[0]PerSample = 0.89843750; TotalTime = 0.02445s; TotalTimePerSample = 0.03820ms; SamplesPerSecond = 26181
- Epoch[ 1 of 3]-Minibatch[  41-  50 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.85668945; EvalErr[0]PerSample = 0.91093749; TotalTime = 0.02429s; TotalTimePerSample = 0.03795ms; SamplesPerSecond = 26352
- Epoch[ 1 of 3]-Minibatch[  51-  60 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.72866368; EvalErr[0]PerSample = 0.89531249; TotalTime = 0.02445s; TotalTimePerSample = 0.03820ms; SamplesPerSecond = 26178
- Epoch[ 1 of 3]-Minibatch[  61-  70 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.51809072; EvalErr[0]PerSample = 0.82968748; TotalTime = 0.02423s; TotalTimePerSample = 0.03786ms; SamplesPerSecond = 26415
- Epoch[ 1 of 3]-Minibatch[  71-  80 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.48454905; EvalErr[0]PerSample = 0.80781251; TotalTime = 0.02249s; TotalTimePerSample = 0.03514ms; SamplesPerSecond = 28457
- Epoch[ 1 of 3]-Minibatch[  81-  90 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.33829641; EvalErr[0]PerSample = 0.76875001; TotalTime = 0.02169s; TotalTimePerSample = 0.03390ms; SamplesPerSecond = 29501
- Epoch[ 1 of 3]-Minibatch[  91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.50167227; EvalErr[0]PerSample = 0.79843748; TotalTime = 0.02178s; TotalTimePerSample = 0.03403ms; SamplesPerSecond = 29386
+Validating for node EvalErrorPrediction. 10 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 62]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 62]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
+
+Validating for node EvalErrorPrediction, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 62]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 62]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 62]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 62]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 62], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 62]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 62]) -> [512, MBSize 62]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 62], B0[512, 1]) -> [512, MBSize 62]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 62]) -> [512, MBSize 62]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 62]) -> [512, MBSize 62]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 62], B1[512, 1]) -> [512, MBSize 62]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 62]) -> [512, MBSize 62]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 62]) -> [132, MBSize 62]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 62], B2[132, 1]) -> [132, MBSize 62]
+Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 62], HLast[132, MBSize 62]) -> [1, 1]
+
+9 out of 20 nodes do not share the minibatch layout with the input data.
+
+
+Starting minibatch loop.
+ Epoch[ 1 of 3]-Minibatch[   1-  10 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.45645981; EvalErr[0]PerSample = 0.92500000; TotalTime = 0.15527s; TotalTimePerSample = 0.24261ms; SamplesPerSecond = 4121
+ Epoch[ 1 of 3]-Minibatch[  11-  20 of 320]: SamplesSeen = 640; TrainLossPerSample =  4.22315750; EvalErr[0]PerSample = 0.90156250; TotalTime = 0.17254s; TotalTimePerSample = 0.26960ms; SamplesPerSecond = 3709
+ Epoch[ 1 of 3]-Minibatch[  21-  30 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.95180664; EvalErr[0]PerSample = 0.84687500; TotalTime = 0.16283s; TotalTimePerSample = 0.25443ms; SamplesPerSecond = 3930
+ Epoch[ 1 of 3]-Minibatch[  31-  40 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.94158020; EvalErr[0]PerSample = 0.89843750; TotalTime = 0.15770s; TotalTimePerSample = 0.24641ms; SamplesPerSecond = 4058
+ Epoch[ 1 of 3]-Minibatch[  41-  50 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.85668945; EvalErr[0]PerSample = 0.91093750; TotalTime = 0.17209s; TotalTimePerSample = 0.26889ms; SamplesPerSecond = 3719
+ Epoch[ 1 of 3]-Minibatch[  51-  60 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.72866364; EvalErr[0]PerSample = 0.89531250; TotalTime = 0.16186s; TotalTimePerSample = 0.25291ms; SamplesPerSecond = 3954
+ Epoch[ 1 of 3]-Minibatch[  61-  70 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.51809235; EvalErr[0]PerSample = 0.82968750; TotalTime = 0.15901s; TotalTimePerSample = 0.24846ms; SamplesPerSecond = 4024
+ Epoch[ 1 of 3]-Minibatch[  71-  80 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.48455200; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.15480s; TotalTimePerSample = 0.24188ms; SamplesPerSecond = 4134
+ Epoch[ 1 of 3]-Minibatch[  81-  90 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.33829346; EvalErr[0]PerSample = 0.76875000; TotalTime = 0.15737s; TotalTimePerSample = 0.24588ms; SamplesPerSecond = 4066
+ Epoch[ 1 of 3]-Minibatch[  91- 100 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.50167236; EvalErr[0]PerSample = 0.79843750; TotalTime = 0.15904s; TotalTimePerSample = 0.24849ms; SamplesPerSecond = 4024
 WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times.
- Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.22861624; EvalErr[0]PerSample = 0.80000001; TotalTime = 0.02166s; TotalTimePerSample = 0.03385ms; SamplesPerSecond = 29546
- Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.32616878; EvalErr[0]PerSample = 0.79062498; TotalTime = 0.02063s; TotalTimePerSample = 0.03224ms; SamplesPerSecond = 31018
- Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.16897583; EvalErr[0]PerSample = 0.77968752; TotalTime = 0.01950s; TotalTimePerSample = 0.03048ms; SamplesPerSecond = 32813
- Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.08891916; EvalErr[0]PerSample = 0.77656251; TotalTime = 0.01961s; TotalTimePerSample = 0.03063ms; SamplesPerSecond = 32644
- Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.06004953; EvalErr[0]PerSample = 0.72968751; TotalTime = 0.01950s; TotalTimePerSample = 0.03046ms; SamplesPerSecond = 32825
- Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.91128540; EvalErr[0]PerSample = 0.69531250; TotalTime = 0.01965s; TotalTimePerSample = 0.03070ms; SamplesPerSecond = 32571
- Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.90172124; EvalErr[0]PerSample = 0.72968751; TotalTime = 0.01828s; TotalTimePerSample = 0.02857ms; SamplesPerSecond = 35003
- Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.73261714; EvalErr[0]PerSample = 0.65312499; TotalTime = 0.01799s; TotalTimePerSample = 0.02811ms; SamplesPerSecond = 35569
- Epoch[ 1 of 3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.66515493; EvalErr[0]PerSample = 0.68437499; TotalTime = 0.01789s; TotalTimePerSample = 0.02796ms; SamplesPerSecond = 35766
- Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.67383432; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.01792s; TotalTimePerSample = 0.02800ms; SamplesPerSecond = 35708
- Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.52869272; EvalErr[0]PerSample = 0.63593751; TotalTime = 0.01805s; TotalTimePerSample = 0.02821ms; SamplesPerSecond = 35451
- Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.60032344; EvalErr[0]PerSample = 0.66718751; TotalTime = 0.01696s; TotalTimePerSample = 0.02650ms; SamplesPerSecond = 37738
- Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.51134038; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.01658s; TotalTimePerSample = 0.02591ms; SamplesPerSecond = 38598
- Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.45362544; EvalErr[0]PerSample = 0.63749999; TotalTime = 0.01663s; TotalTimePerSample = 0.02598ms; SamplesPerSecond = 38491
- Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.41640615; EvalErr[0]PerSample = 0.61562502; TotalTime = 0.01670s; TotalTimePerSample = 0.02610ms; SamplesPerSecond = 38321
- Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.39745474; EvalErr[0]PerSample = 0.62812501; TotalTime = 0.01672s; TotalTimePerSample = 0.02612ms; SamplesPerSecond = 38279
- Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.16415405; EvalErr[0]PerSample = 0.56718749; TotalTime = 0.01621s; TotalTimePerSample = 0.02533ms; SamplesPerSecond = 39481
- Epoch[ 1 of 3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.30347300; EvalErr[0]PerSample = 0.63593751; TotalTime = 0.01583s; TotalTimePerSample = 0.02474ms; SamplesPerSecond = 40427
- Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.24398804; EvalErr[0]PerSample = 0.60937500; TotalTime = 0.01579s; TotalTimePerSample = 0.02467ms; SamplesPerSecond = 40542
- Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.15322256; EvalErr[0]PerSample = 0.57968748; TotalTime = 0.01582s; TotalTimePerSample = 0.02472ms; SamplesPerSecond = 40447
- Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.21664429; EvalErr[0]PerSample = 0.59531248; TotalTime = 0.01570s; TotalTimePerSample = 0.02453ms; SamplesPerSecond = 40761
- Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.25246572; EvalErr[0]PerSample = 0.60156250; TotalTime = 0.01556s; TotalTimePerSample = 0.02431ms; SamplesPerSecond = 41139
-Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.0000031; EvalErrPerSample = 0.72836918; Ave LearnRatePerSample = 0.015625; EpochTime=0.657568
-Starting Epoch 2: learning rate per sample = 0.001953  momentum = 0.656119 
-minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480) with 1 datapasses
- Epoch[ 2 of 3]-Minibatch[   1-  10 of 80]: SamplesSeen = 2560; TrainLossPerSample =  2.08151960; EvalErr[0]PerSample = 0.55859375; TotalTime = 0.03143s; TotalTimePerSample = 0.01228ms; SamplesPerSecond = 81456
- Epoch[ 2 of 3]-Minibatch[  11-  20 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98395634; EvalErr[0]PerSample = 0.54257810; TotalTime = 0.02295s; TotalTimePerSample = 0.00896ms; SamplesPerSecond = 111561
- Epoch[ 2 of 3]-Minibatch[  21-  30 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98575521; EvalErr[0]PerSample = 0.54492188; TotalTime = 0.02287s; TotalTimePerSample = 0.00893ms; SamplesPerSecond = 111951
- Epoch[ 2 of 3]-Minibatch[  31-  40 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.90484965; EvalErr[0]PerSample = 0.53164065; TotalTime = 0.02284s; TotalTimePerSample = 0.00892ms; SamplesPerSecond = 112069
- Epoch[ 2 of 3]-Minibatch[  41-  50 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.88324130; EvalErr[0]PerSample = 0.52539063; TotalTime = 0.02277s; TotalTimePerSample = 0.00889ms; SamplesPerSecond = 112448
- Epoch[ 2 of 3]-Minibatch[  51-  60 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.89109266; EvalErr[0]PerSample = 0.53359377; TotalTime = 0.02287s; TotalTimePerSample = 0.00894ms; SamplesPerSecond = 111917
- Epoch[ 2 of 3]-Minibatch[  61-  70 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.89496076; EvalErr[0]PerSample = 0.52890623; TotalTime = 0.02279s; TotalTimePerSample = 0.00890ms; SamplesPerSecond = 112325
- Epoch[ 2 of 3]-Minibatch[  71-  80 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.85944366; EvalErr[0]PerSample = 0.52265626; TotalTime = 0.02265s; TotalTimePerSample = 0.00885ms; SamplesPerSecond = 113044
-Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9356024; EvalErrPerSample = 0.53603518; Ave LearnRatePerSample = 0.001953125; EpochTime=0.192318
-Starting Epoch 3: learning rate per sample = 0.000098  momentum = 0.656119 
-minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960) with 1 datapasses
- Epoch[ 3 of 3]-Minibatch[   1-  10 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.86752820; EvalErr[0]PerSample = 0.52177733; TotalTime = 0.08080s; TotalTimePerSample = 0.00789ms; SamplesPerSecond = 126735
- Epoch[ 3 of 3]-Minibatch[  11-  20 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.87358737; EvalErr[0]PerSample = 0.51542968; TotalTime = 0.05544s; TotalTimePerSample = 0.00541ms; SamplesPerSecond = 184694
-Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8705578; EvalErrPerSample = 0.5186035; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=0.139063
+ Epoch[ 1 of 3]-Minibatch[ 101- 110 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.22861633; EvalErr[0]PerSample = 0.80000000; TotalTime = 0.16485s; TotalTimePerSample = 0.25757ms; SamplesPerSecond = 3882
+ Epoch[ 1 of 3]-Minibatch[ 111- 120 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.32616882; EvalErr[0]PerSample = 0.79062500; TotalTime = 0.15116s; TotalTimePerSample = 0.23618ms; SamplesPerSecond = 4234
+ Epoch[ 1 of 3]-Minibatch[ 121- 130 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.16897583; EvalErr[0]PerSample = 0.77968750; TotalTime = 0.15167s; TotalTimePerSample = 0.23699ms; SamplesPerSecond = 4219
+ Epoch[ 1 of 3]-Minibatch[ 131- 140 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.08891907; EvalErr[0]PerSample = 0.77656250; TotalTime = 0.16170s; TotalTimePerSample = 0.25265ms; SamplesPerSecond = 3958
+ Epoch[ 1 of 3]-Minibatch[ 141- 150 of 320]: SamplesSeen = 640; TrainLossPerSample =  3.06005249; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.15522s; TotalTimePerSample = 0.24254ms; SamplesPerSecond = 4123
+ Epoch[ 1 of 3]-Minibatch[ 151- 160 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.91128540; EvalErr[0]PerSample = 0.69531250; TotalTime = 0.15756s; TotalTimePerSample = 0.24618ms; SamplesPerSecond = 4062
+ Epoch[ 1 of 3]-Minibatch[ 161- 170 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.90172119; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.15992s; TotalTimePerSample = 0.24987ms; SamplesPerSecond = 4002
+ Epoch[ 1 of 3]-Minibatch[ 171- 180 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.73261719; EvalErr[0]PerSample = 0.65312500; TotalTime = 0.16060s; TotalTimePerSample = 0.25093ms; SamplesPerSecond = 3985
+ Epoch[ 1 of 3]-Minibatch[ 181- 190 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.66515503; EvalErr[0]PerSample = 0.68437500; TotalTime = 0.15478s; TotalTimePerSample = 0.24184ms; SamplesPerSecond = 4134
+ Epoch[ 1 of 3]-Minibatch[ 191- 200 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.67383423; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.14563s; TotalTimePerSample = 0.22755ms; SamplesPerSecond = 4394
+ Epoch[ 1 of 3]-Minibatch[ 201- 210 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.52869263; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.15331s; TotalTimePerSample = 0.23955ms; SamplesPerSecond = 4174
+ Epoch[ 1 of 3]-Minibatch[ 211- 220 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.60032349; EvalErr[0]PerSample = 0.66718750; TotalTime = 0.15816s; TotalTimePerSample = 0.24713ms; SamplesPerSecond = 4046
+ Epoch[ 1 of 3]-Minibatch[ 221- 230 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.51134033; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.16587s; TotalTimePerSample = 0.25917ms; SamplesPerSecond = 3858
+ Epoch[ 1 of 3]-Minibatch[ 231- 240 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.45362549; EvalErr[0]PerSample = 0.63750000; TotalTime = 0.15854s; TotalTimePerSample = 0.24772ms; SamplesPerSecond = 4036
+ Epoch[ 1 of 3]-Minibatch[ 241- 250 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.41640015; EvalErr[0]PerSample = 0.61562500; TotalTime = 0.15948s; TotalTimePerSample = 0.24919ms; SamplesPerSecond = 4013
+ Epoch[ 1 of 3]-Minibatch[ 251- 260 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.39745483; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.16179s; TotalTimePerSample = 0.25280ms; SamplesPerSecond = 3955
+ Epoch[ 1 of 3]-Minibatch[ 261- 270 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.16415405; EvalErr[0]PerSample = 0.56718750; TotalTime = 0.16235s; TotalTimePerSample = 0.25367ms; SamplesPerSecond = 3942
+ Epoch[ 1 of 3]-Minibatch[ 271- 280 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.30347290; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.15271s; TotalTimePerSample = 0.23861ms; SamplesPerSecond = 4190
+ Epoch[ 1 of 3]-Minibatch[ 281- 290 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.24398804; EvalErr[0]PerSample = 0.60937500; TotalTime = 0.16522s; TotalTimePerSample = 0.25815ms; SamplesPerSecond = 3873
+ Epoch[ 1 of 3]-Minibatch[ 291- 300 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.15322266; EvalErr[0]PerSample = 0.57968750; TotalTime = 0.15988s; TotalTimePerSample = 0.24982ms; SamplesPerSecond = 4002
+ Epoch[ 1 of 3]-Minibatch[ 301- 310 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.21664429; EvalErr[0]PerSample = 0.59531250; TotalTime = 0.14906s; TotalTimePerSample = 0.23290ms; SamplesPerSecond = 4293
+ Epoch[ 1 of 3]-Minibatch[ 311- 320 of 320]: SamplesSeen = 640; TrainLossPerSample =  2.25246582; EvalErr[0]PerSample = 0.60156250; TotalTime = 0.14161s; TotalTimePerSample = 0.22126ms; SamplesPerSecond = 4519
+Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.0000031; EvalErrPerSample = 0.72836918; Ave LearnRatePerSample = 0.015625; EpochTime=5.105428
+Starting Epoch 2: learning rate per sample = 0.001953  effective momentum = 0.656119 
+minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 0 of 1, with 1 datapasses
+
+Starting minibatch loop.
+ Epoch[ 2 of 3]-Minibatch[   1-  10 of 80]: SamplesSeen = 2560; TrainLossPerSample =  2.08151951; EvalErr[0]PerSample = 0.55859375; TotalTime = 0.35121s; TotalTimePerSample = 0.13719ms; SamplesPerSecond = 7289
+ Epoch[ 2 of 3]-Minibatch[  11-  20 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98395710; EvalErr[0]PerSample = 0.54257813; TotalTime = 0.31404s; TotalTimePerSample = 0.12267ms; SamplesPerSecond = 8151
+ Epoch[ 2 of 3]-Minibatch[  21-  30 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.98575516; EvalErr[0]PerSample = 0.54492188; TotalTime = 0.27053s; TotalTimePerSample = 0.10567ms; SamplesPerSecond = 9463
+ Epoch[ 2 of 3]-Minibatch[  31-  40 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.90485039; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.24565s; TotalTimePerSample = 0.09596ms; SamplesPerSecond = 10421
+ Epoch[ 2 of 3]-Minibatch[  41-  50 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.88324280; EvalErr[0]PerSample = 0.52539063; TotalTime = 0.22956s; TotalTimePerSample = 0.08967ms; SamplesPerSecond = 11151
+ Epoch[ 2 of 3]-Minibatch[  51-  60 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.89109344; EvalErr[0]PerSample = 0.53359375; TotalTime = 0.22156s; TotalTimePerSample = 0.08655ms; SamplesPerSecond = 11554
+ Epoch[ 2 of 3]-Minibatch[  61-  70 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.89496155; EvalErr[0]PerSample = 0.52890625; TotalTime = 0.21987s; TotalTimePerSample = 0.08589ms; SamplesPerSecond = 11643
+ Epoch[ 2 of 3]-Minibatch[  71-  80 of 80]: SamplesSeen = 2560; TrainLossPerSample =  1.85944366; EvalErr[0]PerSample = 0.52265625; TotalTime = 0.19881s; TotalTimePerSample = 0.07766ms; SamplesPerSecond = 12876
+Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.935603; EvalErrPerSample = 0.53603518; Ave LearnRatePerSample = 0.001953125; EpochTime=2.098193
+Starting Epoch 3: learning rate per sample = 0.000098  effective momentum = 0.656119 
+minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 0 of 1, with 1 datapasses
+
+Starting minibatch loop.
+ Epoch[ 3 of 3]-Minibatch[   1-  10 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.86752853; EvalErr[0]PerSample = 0.52177734; TotalTime = 0.71783s; TotalTimePerSample = 0.07010ms; SamplesPerSecond = 14265
+ Epoch[ 3 of 3]-Minibatch[  11-  20 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.87358818; EvalErr[0]PerSample = 0.51542969; TotalTime = 0.60551s; TotalTimePerSample = 0.05913ms; SamplesPerSecond = 16911
+Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8705584; EvalErrPerSample = 0.5186035; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=1.428405
+CNTKCommandTrainEnd: speechTrain
 COMPLETED
 === Deleting last epoch data
 ==== Re-running from checkpoint
+=== Running /cygdrive/e/NetScale/CNTK/git_repos/cplx_master2/x64/debug/cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E/cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E DeviceId=0
 -------------------------------------------------------------------
 Build info: 
 
-		Built time: Aug 11 2015 16:18:17
-		Last modified date: Tue Aug 11 16:16:08 2015
-		Built by svcphil on dphaim-26-new           
-		Build Path: C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\MachineLearning\CNTK\
+		Built time: Oct 24 2015 13:33:25
+		Last modified date: Thu Oct 22 16:00:27 2015
+		Built by amitaga on Amitaga-Win-DT3           
+		Build Path: E:\NetScale\CNTK\git_repos\cplx_master2\MachineLearning\CNTK\
 		CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
-		Build Branch: master
-		Build SHA1: 397cc7cc16c00b1c12864d331c0729fde7a1bde3
 -------------------------------------------------------------------
-running on dphaim-26-new at 2015/08/11 17:47:34
-command line options: 
-configFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\QuickE2E\cntk.config RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data DeviceId=Auto 
+running on Amitaga-Win-DT3 at 2015/10/24 22:08:20
+command line: 
+E:\NetScale\CNTK\git_repos\cplx_master2\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E/cntk.config RunDir=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E DeviceId=0 
 
 >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
 precision=float
@@ -389,6 +598,31 @@ speechTrain=[
         uniformInit=true
         needPrior=true
     ]
+    ExperimentalNetworkBuilder=[    // the same as above but with BS
+        layerSizes=363:512:512:132
+        trainingCriterion='CE'
+        evalCriterion='Err'
+        applyMeanVarNorm=true
+        L = Length(layerSizes)-1    // number of model layers
+        features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label')
+        featNorm = if applyMeanVarNorm
+                   then MeanVarNorm(features)
+                   else features
+        layers[layer:1..L-1] = if layer > 1
+                               then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1])
+                               else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1])
+        outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1])
+        outZ = outLayer.z        // + PastValue(layerSizes[L], 1, outLayer.z)
+        CE = if trainingCriterion == 'CE'
+             then CrossEntropyWithSoftmax(labels, outZ, tag='criterion')
+             else Fail('unknown trainingCriterion ' + trainingCriterion)
+        Err = if evalCriterion == 'Err' then
+              ErrorPrediction(labels, outZ, tag='eval')
+              else Fail('unknown evalCriterion ' + evalCriterion)
+        logPrior = LogPrior(labels)
+        // TODO: how to add a tag to an infix operation?
+        ScaledLogLikelihood = Minus (outZ, logPrior, tag='output')
+    ]
     SGD=[
         epochSize=20480
         minibatchSize=64:256:1024:
@@ -427,21 +661,22 @@ speechTrain=[
       ]
     ]
 ]
-RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu
-DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data
-DeviceId=Auto
+RunDir=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu
+DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
+ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E
+DeviceId=0
 
 <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED)  <<<<<<<<<<<<<<<<<<<<
 
 >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 precision=float
 command=speechTrain
-deviceId=Auto
+deviceId=0
 parallelTrain=false
 speechTrain=[
     action=train
-    modelPath=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn
-    deviceId=Auto
+    modelPath=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn
+    deviceId=0
     traceLevel=1
     SimpleNetworkBuilder=[
         layerSizes=363:512:512:132
@@ -453,6 +688,31 @@ speechTrain=[
         uniformInit=true
         needPrior=true
     ]
+    ExperimentalNetworkBuilder=[    // the same as above but with BS
+        layerSizes=363:512:512:132
+        trainingCriterion='CE'
+        evalCriterion='Err'
+        applyMeanVarNorm=true
+        L = Length(layerSizes)-1    // number of model layers
+        features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label')
+        featNorm = if applyMeanVarNorm
+                   then MeanVarNorm(features)
+                   else features
+        layers[layer:1..L-1] = if layer > 1
+                               then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1])
+                               else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1])
+        outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1])
+        outZ = outLayer.z        // + PastValue(layerSizes[L], 1, outLayer.z)
+        CE = if trainingCriterion == 'CE'
+             then CrossEntropyWithSoftmax(labels, outZ, tag='criterion')
+             else Fail('unknown trainingCriterion ' + trainingCriterion)
+        Err = if evalCriterion == 'Err' then
+              ErrorPrediction(labels, outZ, tag='eval')
+              else Fail('unknown evalCriterion ' + evalCriterion)
+        logPrior = LogPrior(labels)
+        // TODO: how to add a tag to an infix operation?
+        ScaledLogLikelihood = Minus (outZ, logPrior, tag='output')
+    ]
     SGD=[
         epochSize=20480
         minibatchSize=64:256:1024:
@@ -484,30 +744,32 @@ speechTrain=[
           scpFile=glob_0000.scp
       ]
       labels=[
-          mlfFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf
-          labelMappingFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list
+          mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf
+          labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list
           labelDim=132
           labelType=Category
       ]
     ]
 ]
-RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu
-DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data
-DeviceId=Auto
+RunDir=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu
+DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
+ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E
+DeviceId=0
 
 <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 
 >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
 configparameters: cntk.config:command=speechTrain
-configparameters: cntk.config:DataDir=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data
-configparameters: cntk.config:deviceId=Auto
+configparameters: cntk.config:ConfigDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\QuickE2E
+configparameters: cntk.config:DataDir=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data
+configparameters: cntk.config:deviceId=0
 configparameters: cntk.config:parallelTrain=false
 configparameters: cntk.config:precision=float
-configparameters: cntk.config:RunDir=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu
+configparameters: cntk.config:RunDir=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu
 configparameters: cntk.config:speechTrain=[
     action=train
-    modelPath=C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn
-    deviceId=Auto
+    modelPath=C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn
+    deviceId=0
     traceLevel=1
     SimpleNetworkBuilder=[
         layerSizes=363:512:512:132
@@ -519,6 +781,31 @@ configparameters: cntk.config:speechTrain=[
         uniformInit=true
         needPrior=true
     ]
+    ExperimentalNetworkBuilder=[    // the same as above but with BS
+        layerSizes=363:512:512:132
+        trainingCriterion='CE'
+        evalCriterion='Err'
+        applyMeanVarNorm=true
+        L = Length(layerSizes)-1    // number of model layers
+        features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label')
+        featNorm = if applyMeanVarNorm
+                   then MeanVarNorm(features)
+                   else features
+        layers[layer:1..L-1] = if layer > 1
+                               then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1])
+                               else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1])
+        outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1])
+        outZ = outLayer.z        // + PastValue(layerSizes[L], 1, outLayer.z)
+        CE = if trainingCriterion == 'CE'
+             then CrossEntropyWithSoftmax(labels, outZ, tag='criterion')
+             else Fail('unknown trainingCriterion ' + trainingCriterion)
+        Err = if evalCriterion == 'Err' then
+              ErrorPrediction(labels, outZ, tag='eval')
+              else Fail('unknown evalCriterion ' + evalCriterion)
+        logPrior = LogPrior(labels)
+        // TODO: how to add a tag to an infix operation?
+        ScaledLogLikelihood = Minus (outZ, logPrior, tag='output')
+    ]
     SGD=[
         epochSize=20480
         minibatchSize=64:256:1024:
@@ -550,8 +837,8 @@ configparameters: cntk.config:speechTrain=[
           scpFile=glob_0000.scp
       ]
       labels=[
-          mlfFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf
-          labelMappingFile=C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list
+          mlfFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf
+          labelMappingFile=E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list
           labelDim=132
           labelType=Category
       ]
@@ -561,25 +848,24 @@ configparameters: cntk.config:speechTrain=[
 <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
 command: speechTrain 
 precision = float
-LockDevice: Capture device 1 and lock it for exclusive use
-LockDevice: Capture device 2 and lock it for exclusive use
-LockDevice: Capture device 3 and lock it for exclusive use
-LockDevice: Capture device 0 and lock it for exclusive use
-LockDevice: Capture device 1 and lock it for exclusive use
-SimpleNetworkBuilder Using GPU 1
+CNTKModelPath: C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn
+CNTKCommandTrainInfo: speechTrain : 3
+CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3
+CNTKCommandTrainBegin: speechTrain
+SimpleNetworkBuilder Using GPU 0
 reading script file glob_0000.scp ... 948 entries
 trainlayer: OOV-exclusion code enabled, but no unigram specified to derive the word set from, so you won't get OOV exclusion
-total 132 state names in state list C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/state.list
-htkmlfreader: reading MLF file C:\Users\svcphil\workspace.vlivan\CNTK-Build-Windows\Tests\Speech\Data/glob_0000.mlf ... total 948 entries
+total 132 state names in state list E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/state.list
+htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\cplx_master2\Tests\Speech\Data/glob_0000.mlf ... total 948 entries
 ...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
 label set 0: 129 classes
 minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
-Starting from checkpoint. Load Network From File C:\Users\svcphil\AppData\Local\Temp\2\cntk-test-20150811174551.851046\Speech_QuickE2E@release_gpu/models/cntkSpeech.dnn.2.
+Starting from checkpoint. Load Network From File C:\cygwin64\tmp\cntk-test-20151024140721.927553\Speech_QuickE2E@debug_gpu/models/cntkSpeech.dnn.2.
 
 
 Printing Gradient Computation Node Order ... 
 
-CrossEntropyWithSoftmax[0, 0] = CrossEntropyWithSoftmax(labels[132, 256], HLast[0, 0])
+CrossEntropyWithSoftmax[0, 0] = CrossEntropyWithSoftmax(labels[132, 0], HLast[0, 0])
 HLast[0, 0] = Plus(W2*H1[0, 0], B2[132, 1])
 B2[132, 1] = LearnableParameter
 W2*H1[0, 0] = Times(W2[132, 512], H2[0, 0])
@@ -591,148 +877,474 @@ H1[0, 0] = Sigmoid(W0*features+B0[0, 0])
 W0*features+B0[0, 0] = Plus(W0*features[0, 0], B0[512, 1])
 B0[512, 1] = LearnableParameter
 W0*features[0, 0] = Times(W0[512, 363], MVNormalizedFeatures[0, 0])
-MVNormalizedFeatures[0, 0] = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
-InvStdOfFeatures[363, 1] = InvStdDev(features[363, 256])
-MeanOfFeatures[363, 1] = Mean(features[363, 256])
-features[363, 256] = InputValue
+MVNormalizedFeatures[0, 0] = PerDimMeanVarNormalization(features[363, 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
+InvStdOfFeatures[363, 1] = InvStdDev(features[363, 0])
+MeanOfFeatures[363, 1] = Mean(features[363, 0])
+features[363, 0] = InputValue
 W0[512, 363] = LearnableParameter
 W1[512, 512] = LearnableParameter
 W2[132, 512] = LearnableParameter
-labels[132, 256] = InputValue
+labels[132, 0] = InputValue
 
-Validating node CrossEntropyWithSoftmax 
+Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1.
 
-Validating --> labels = InputValue
-Validating --> W2 = LearnableParameter
-Validating --> W1 = LearnableParameter
-Validating --> W0 = LearnableParameter
-Validating --> features = InputValue
-Validating --> MeanOfFeatures = Mean(features[363, 256])
-Validating --> InvStdOfFeatures = InvStdDev(features[363, 256])
-Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
-Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256])
-Validating --> B0 = LearnableParameter
-Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1])
-Validating --> H1 = Sigmoid(W0*features+B0[512, 256])
-Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256])
-Validating --> B1 = LearnableParameter
-Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1])
-Validating --> H2 = Sigmoid(W1*H1+B1[512, 256])
-Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256])
-Validating --> B2 = LearnableParameter
-Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1])
-Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, 256], HLast[132, 256])
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+Validating for node CrossEntropyWithSoftmax. 12 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+Validating for node CrossEntropyWithSoftmax, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+9 out of 20 nodes do not share the minibatch layout with the input data.
 
 
 
-Validating node ScaledLogLikelihood 
+Validating for node CrossEntropyWithSoftmax. 20 nodes to process in pass 1.
 
-Validating --> W2 = LearnableParameter
-Validating --> W1 = LearnableParameter
-Validating --> W0 = LearnableParameter
-Validating --> features = InputValue
-Validating --> MeanOfFeatures = Mean(features[363, 256])
-Validating --> InvStdOfFeatures = InvStdDev(features[363, 256])
-Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
-Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256])
-Validating --> B0 = LearnableParameter
-Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1])
-Validating --> H1 = Sigmoid(W0*features+B0[512, 256])
-Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256])
-Validating --> B1 = LearnableParameter
-Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1])
-Validating --> H2 = Sigmoid(W1*H1+B1[512, 256])
-Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256])
-Validating --> B2 = LearnableParameter
-Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1])
-Validating --> labels = InputValue
-Validating --> Prior = Mean(labels[132, 256])
-Validating --> LogOfPrior = Log(Prior[132, 1])
-Validating --> ScaledLogLikelihood = Minus(HLast[132, 256], LogOfPrior[132, 1])
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+Validating for node CrossEntropyWithSoftmax. 9 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+Validating for node CrossEntropyWithSoftmax, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+9 out of 20 nodes do not share the minibatch layout with the input data.
 
 
 
-Validating node EvalErrorPrediction 
+Validating for node ScaledLogLikelihood. 22 nodes to process in pass 1.
 
-Validating --> labels = InputValue
-Validating --> W2 = LearnableParameter
-Validating --> W1 = LearnableParameter
-Validating --> W0 = LearnableParameter
-Validating --> features = InputValue
-Validating --> MeanOfFeatures = Mean(features[363, 256])
-Validating --> InvStdOfFeatures = InvStdDev(features[363, 256])
-Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
-Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256])
-Validating --> B0 = LearnableParameter
-Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1])
-Validating --> H1 = Sigmoid(W0*features+B0[512, 256])
-Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256])
-Validating --> B1 = LearnableParameter
-Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1])
-Validating --> H2 = Sigmoid(W1*H1+B1[512, 256])
-Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256])
-Validating --> B2 = LearnableParameter
-Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1])
-Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, 256], HLast[132, 256])
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1]
+Validating --> LogOfPrior = Log(Prior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0]
+
+Validating for node ScaledLogLikelihood. 11 nodes to process in pass 2.
+
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1]
+Validating --> LogOfPrior = Log(Prior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0]
+
+Validating for node ScaledLogLikelihood, final verification.
+
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1]
+Validating --> LogOfPrior = Log(Prior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0]
+
+10 out of 22 nodes do not share the minibatch layout with the input data.
+
+
+
+Validating for node ScaledLogLikelihood. 22 nodes to process in pass 1.
+
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1]
+Validating --> LogOfPrior = Log(Prior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0]
+
+Validating for node ScaledLogLikelihood. 9 nodes to process in pass 2.
+
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1]
+Validating --> LogOfPrior = Log(Prior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0]
+
+Validating for node ScaledLogLikelihood, final verification.
+
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> Prior = Mean(labels[132, MBSize 0]) -> [132, 1]
+Validating --> LogOfPrior = Log(Prior[132, 1]) -> [132, 1]
+Validating --> ScaledLogLikelihood = Minus(HLast[132, MBSize 0], LogOfPrior[132, 1]) -> [132, MBSize 0]
+
+10 out of 22 nodes do not share the minibatch layout with the input data.
+
+
+
+Validating for node EvalErrorPrediction. 20 nodes to process in pass 1.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+Validating for node EvalErrorPrediction. 9 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+Validating for node EvalErrorPrediction, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+9 out of 20 nodes do not share the minibatch layout with the input data.
+
+
+
+Validating for node EvalErrorPrediction. 20 nodes to process in pass 1.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+Validating for node EvalErrorPrediction. 9 nodes to process in pass 2.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+Validating for node EvalErrorPrediction, final verification.
+
+Validating --> labels = InputValue -> [132, MBSize 0]
+Validating --> W2 = LearnableParameter -> [132, 512]
+Validating --> W1 = LearnableParameter -> [512, 512]
+Validating --> W0 = LearnableParameter -> [512, 363]
+Validating --> features = InputValue -> [363, MBSize 0]
+Validating --> MeanOfFeatures = Mean(features[363, MBSize 0]) -> [363, 1]
+Validating --> InvStdOfFeatures = InvStdDev(features[363, MBSize 0]) -> [363, 1]
+Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, MBSize 0], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1]) -> [363, MBSize 0]
+Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, MBSize 0]) -> [512, MBSize 0]
+Validating --> B0 = LearnableParameter -> [512, 1]
+Validating --> W0*features+B0 = Plus(W0*features[512, MBSize 0], B0[512, 1]) -> [512, MBSize 0]
+Validating --> H1 = Sigmoid(W0*features+B0[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W1*H1 = Times(W1[512, 512], H1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> B1 = LearnableParameter -> [512, 1]
+Validating --> W1*H1+B1 = Plus(W1*H1[512, MBSize 0], B1[512, 1]) -> [512, MBSize 0]
+Validating --> H2 = Sigmoid(W1*H1+B1[512, MBSize 0]) -> [512, MBSize 0]
+Validating --> W2*H1 = Times(W2[132, 512], H2[512, MBSize 0]) -> [132, MBSize 0]
+Validating --> B2 = LearnableParameter -> [132, 1]
+Validating --> HLast = Plus(W2*H1[132, MBSize 0], B2[132, 1]) -> [132, MBSize 0]
+Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, MBSize 0], HLast[132, MBSize 0]) -> [1, 1]
+
+9 out of 20 nodes do not share the minibatch layout with the input data.
 
 GetTrainCriterionNodes  ...
 GetEvalCriterionNodes  ...
-
-
-Validating node CrossEntropyWithSoftmax 
-
-Validating --> labels = InputValue
-Validating --> W2 = LearnableParameter
-Validating --> W1 = LearnableParameter
-Validating --> W0 = LearnableParameter
-Validating --> features = InputValue
-Validating --> MeanOfFeatures = Mean(features[363, 256])
-Validating --> InvStdOfFeatures = InvStdDev(features[363, 256])
-Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 256], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
-Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 256])
-Validating --> B0 = LearnableParameter
-Validating --> W0*features+B0 = Plus(W0*features[512, 256], B0[512, 1])
-Validating --> H1 = Sigmoid(W0*features+B0[512, 256])
-Validating --> W1*H1 = Times(W1[512, 512], H1[512, 256])
-Validating --> B1 = LearnableParameter
-Validating --> W1*H1+B1 = Plus(W1*H1[512, 256], B1[512, 1])
-Validating --> H2 = Sigmoid(W1*H1+B1[512, 256])
-Validating --> W2*H1 = Times(W2[132, 512], H2[512, 256])
-Validating --> B2 = LearnableParameter
-Validating --> HLast = Plus(W2*H1[132, 256], B2[132, 1])
-Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132, 256], HLast[132, 256])
-
 No PreCompute nodes found, skipping PreCompute step
 Set Max Temp Mem Size For Convolution Nodes to 0 samples.
-Starting Epoch 3: learning rate per sample = 0.000098  momentum = 0.656119 
-minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960) with 1 datapasses
+Starting Epoch 3: learning rate per sample = 0.000098  effective momentum = 0.656119 
+minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 0 of 1, with 1 datapasses
 requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
 
-
-Validating node EvalErrorPrediction 
-
-Validating --> labels = InputValue
-Validating --> W2 = LearnableParameter
-Validating --> W1 = LearnableParameter
-Validating --> W0 = LearnableParameter
-Validating --> features = InputValue
-Validating --> MeanOfFeatures = Mean(features[363, 1024])
-Validating --> InvStdOfFeatures = InvStdDev(features[363, 1024])
-Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363, 1024], MeanOfFeatures[363, 1], InvStdOfFeatures[363, 1])
-Validating --> W0*features = Times(W0[512, 363], MVNormalizedFeatures[363, 1024])
-Validating --> B0 = LearnableParameter
-Validating --> W0*features+B0 = Plus(W0*features[512, 1024], B0[512, 1])
-Validating --> H1 = Sigmoid(W0*features+B0[512, 1024])
-Validating --> W1*H1 = Times(W1[512, 512], H1[512, 1024])
-Validating --> B1 = LearnableParameter
-Validating --> W1*H1+B1 = Plus(W1*H1[512, 1024], B1[512, 1])
-Validating --> H2 = Sigmoid(W1*H1+B1[512, 1024])
-Validating --> W2*H1 = Times(W2[132, 512], H2[512, 1024])
-Validating --> B2 = LearnableParameter
-Validating --> HLast = Plus(W2*H1[132, 1024], B2[132, 1])
-Validating --> EvalErrorPrediction = ErrorPrediction(labels[132, 1024], HLast[132, 1024])
-
- Epoch[ 3 of 3]-Minibatch[   1-  10 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.86752820; EvalErr[0]PerSample = 0.52177733; TotalTime = 0.42093s; TotalTimePerSample = 0.04111ms; SamplesPerSecond = 24327
- Epoch[ 3 of 3]-Minibatch[  11-  20 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.87358737; EvalErr[0]PerSample = 0.51542968; TotalTime = 0.05521s; TotalTimePerSample = 0.00539ms; SamplesPerSecond = 185480
-Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8705578; EvalErrPerSample = 0.5186035; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=0.690137
+Starting minibatch loop.
+ Epoch[ 3 of 3]-Minibatch[   1-  10 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.86752853; EvalErr[0]PerSample = 0.52177734; TotalTime = 1.50756s; TotalTimePerSample = 0.14722ms; SamplesPerSecond = 6792
+ Epoch[ 3 of 3]-Minibatch[  11-  20 of 20]: SamplesSeen = 10240; TrainLossPerSample =  1.87358818; EvalErr[0]PerSample = 0.51542969; TotalTime = 0.86938s; TotalTimePerSample = 0.08490ms; SamplesPerSecond = 11778
+Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8705584; EvalErrPerSample = 0.5186035; Ave LearnRatePerSample = 9.765625146e-005; EpochTime=6.283729
+CNTKCommandTrainEnd: speechTrain
 COMPLETED